run-llama · rjzhb · Apr 22, 2024 · Apr 22, 2024 · Apr 22, 2024 · Apr 22, 2024
diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py
@@ -4,6 +4,8 @@
 import logging
 import mimetypes
 import multiprocessing
+import tempfile
+import magic
 import warnings
 from datetime import datetime
 from functools import reduce
@@ -20,6 +22,38 @@
 from tqdm import tqdm
 
 
+def _try_loading_file_extension_by_mime_type() -> Dict[str, str]:
+    """
+    Return the mapping from MIME type to the file extension used for it.
+
+    The 'magic' module is imported first purely as an availability check,
+    since the mapping is only useful when MIME types can be detected.
+
+    Returns:
+        Dict[str, str]: MIME type -> file extension (including the leading dot).
+
+    Raises:
+        ImportError: If the 'magic' module cannot be imported.
+    """
+    try:
+        import magic  # noqa: F401 -- availability check only
+    except ImportError as err:
+        raise ImportError(
+            "The 'magic' module is not installed. Please install it to enable MIME type detection."
+        ) from err
+
+    # Each MIME type must appear exactly once: a repeated key in a dict
+    # literal silently discards the earlier value.
+    return {
+        'application/pdf': '.pdf',
+        'image/jpeg': '.jpeg',
+        'image/png': '.png',
+        'text/plain': '.txt',
+        'text/csv': '.csv',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
+        'application/vnd.ms-powerpoint': '.ppt',
+        'application/vnd.ms-powerpoint.presentation.macroenabled.12': '.pptm',
+        'application/vnd.hwp': '.hwp',
+        'application/epub+zip': '.epub',
+        'text/markdown': '.md',
+        'application/mbox': '.mbox',
+        'application/x-ipynb+json': '.ipynb',
+        'audio/mpeg': '.mp3',
+        'video/mp4': '.mp4',
+    }
+
+
 def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
     try:
         from llama_index.readers.file import (
@@ -172,6 +206,7 @@ class SimpleDirectoryReader(BaseReader):
     """
 
     supported_suffix_fn: Callable = _try_loading_included_file_formats
+    # Returns the MIME type -> file extension map consulted by load_file_from_binary.
+    mime_types_fn: Callable = _try_loading_file_extension_by_mime_type
 
     def __init__(
         self,
@@ -642,3 +677,34 @@ def iter_data(
 
             if len(documents) > 0:
                 yield documents
+
+
+    @staticmethod
+    def load_file_from_binary(
+            binary_data: bytes,
+            encoding: str = "utf-8",
+            errors: str = "ignore",
+            raise_on_error: bool = False,
+    ) -> List[Document]:
+        """
+        Load documents from the raw bytes of a single file.
+
+        The MIME type is detected from the bytes and mapped to a file
+        extension ('.bin' when the type is not in the map). The data is then
+        written to a temporary file with that suffix and handed to
+        `load_file`. The temporary file is removed afterwards.
+
+        Args:
+            binary_data (bytes): Raw content of the file.
+            encoding (str): Text encoding forwarded to `load_file`.
+            errors (str): Decoding error policy forwarded to `load_file`.
+            raise_on_error (bool): Forwarded to `load_file`.
+
+        Returns:
+            List[Document]: The documents produced by `load_file`.
+        """
+        mime_to_suffix = SimpleDirectoryReader.mime_types_fn()
+
+        # use magic to get MIME type from binary data
+        mime_type = magic.from_buffer(binary_data, mime=True)
+        file_suffix = mime_to_suffix.get(mime_type, '.bin')
+
+        # Bound before the try block so the cleanup below never hits an
+        # undefined name when creating the temporary file itself fails.
+        temp_filename = None
+        try:
+            # delete=False: the file is closed here and re-opened by path below
+            with tempfile.NamedTemporaryFile(suffix=file_suffix, delete=False) as temp_file:
+                # Record the name first so a failed write is still cleaned up
+                temp_filename = temp_file.name
+                temp_file.write(binary_data)
+
+            return SimpleDirectoryReader.load_file(
+                Path(temp_filename),
+                None,
+                {},
+                encoding=encoding,
+                errors=errors,
+                raise_on_error=raise_on_error,
+            )
+        finally:
+            # Ensure the temporary file is deleted
+            if temp_filename is not None and os.path.exists(temp_filename):
+                os.remove(temp_filename)
diff --git a/llama-index-core/poetry.lock b/llama-index-core/poetry.lock
diff --git a/llama-index-core/pyproject.toml b/llama-index-core/pyproject.toml
@@ -53,6 +53,7 @@ deprecated = ">=1.2.9.3"
 fsspec = ">=2023.5.0"
 httpx = "*"
 langchain = {optional = true, version = ">=0.0.303"}
+python-magic = "^0.4.22"  # MIME type detection from raw bytes (wraps the libmagic C library)
 nest-asyncio = "^1.5.8"
 nltk = "^3.8.1"
 numpy = "*"
@@ -85,6 +86,7 @@ pillow = ">=9.0.0"
 PyYAML = ">=6.0.1"
 llamaindex-py-client = "^0.1.18"
 wrapt = "*"
+reportlab = "*"
 
 [tool.poetry.extras]
 gradientai = [

diff --git a/llama-index-core/tests/readers/test_load_reader.py b/llama-index-core/tests/readers/test_load_reader.py
@@ -1,8 +1,13 @@
+import io
+
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
 from typing import cast
 
+from llama_index.core import SimpleDirectoryReader
 from llama_index.core.readers.loading import load_reader
 from llama_index.core.readers.string_iterable import StringIterableReader
-
+import magic
 
 def test_loading_readers() -> None:
     string_iterable = StringIterableReader()
@@ -14,3 +19,46 @@ def test_loading_readers() -> None:
     )
 
     assert loaded_string_iterable.is_remote == string_iterable.is_remote
+
+def test_load_binary_data_file() -> None:
+    """load_file_from_binary should turn an in-memory PDF into one document."""
+    # Render a one-line PDF entirely in memory
+    pdf_buffer = io.BytesIO()
+    c = canvas.Canvas(pdf_buffer, pagesize=letter)
+    c.drawString(100, 750, "Hello, this is a PDF file.")
+    c.save()
+    pdf_data = pdf_buffer.getvalue()
+
+    # Stub the MIME type identification to return 'application/pdf', and
+    # restore the real function afterwards so the stub cannot leak into
+    # other tests.
+    original_from_buffer = magic.from_buffer
+    magic.from_buffer = lambda x, mime: 'application/pdf'
+    try:
+        documents = SimpleDirectoryReader.load_file_from_binary(pdf_data)
+    finally:
+        magic.from_buffer = original_from_buffer
+
+    # Check the count first so a wrong result fails with a clear assertion
+    # rather than an IndexError.
+    assert len(documents) == 1
+    assert documents[0].text == "Hello, this is a PDF file.\n"
+
+def test_load_unsupported_binary_data_file_type() -> None:
+    """Bytes with an unmapped MIME type should still yield a single document."""
+    # Create binary data for a non-text type that is not supported
+    binary_data = b'\x00\x01\x02\x03\x04'
+
+    # Stub the MIME type identification to return an unsupported type, and
+    # restore the real function afterwards so the stub cannot leak into
+    # other tests.
+    original_from_buffer = magic.from_buffer
+    magic.from_buffer = lambda x, mime: 'application/octet-stream'
+    try:
+        documents = SimpleDirectoryReader.load_file_from_binary(binary_data)
+    finally:
+        magic.from_buffer = original_from_buffer
+
+    # One document is expected; its text may be gibberish or empty, so only
+    # its type is checked.
+    assert len(documents) == 1
+    assert isinstance(documents[0].text, str)