Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature Request]: Read binary data (bytes) #12878 #13017

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
66 changes: 66 additions & 0 deletions llama-index-core/llama_index/core/readers/file/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import logging
import mimetypes
import multiprocessing
import tempfile
import magic
import warnings
from datetime import datetime
from functools import reduce
Expand All @@ -20,6 +22,38 @@
from tqdm import tqdm


def _try_loading_file_extension_by_mime_type() -> Dict[str, str]:
"""
Returns a dictionary mapping MIME types to their corresponding file extensions.
Attempts to import the 'magic' module, which is used for file type identification.
"""
try:
import magic
except ImportError:
raise ImportError("The 'magic' module is not installed. Please install it to enable MIME type detection.")

mime_to_extension = {
'application/pdf': '.pdf',
'image/jpeg': '.jpg',
'image/png': '.png',
'text/plain': '.txt',
'text/csv': '.csv',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
'application/vnd.ms-powerpoint': '.ppt',
'application/vnd.ms-powerpoint.presentation.macroenabled.12': '.pptm',
'application/vnd.hwp': '.hwp',
'application/epub+zip': '.epub',
'text/markdown': '.md',
'application/mbox': '.mbox',
'application/x-ipynb+json': '.ipynb',
'audio/mpeg': '.mp3',
'video/mp4': '.mp4',
'image/jpeg': '.jpeg' # This entry will take precedence over the previous '.jpg' entry for 'image/jpeg'
}
return mime_to_extension


def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
try:
from llama_index.readers.file import (
Expand Down Expand Up @@ -172,6 +206,7 @@ class SimpleDirectoryReader(BaseReader):
"""

supported_suffix_fn: Callable = _try_loading_included_file_formats
mime_types_fn: Callable = _try_loading_file_extension_by_mime_type

def __init__(
self,
Expand Down Expand Up @@ -642,3 +677,34 @@ def iter_data(

if len(documents) > 0:
yield documents


@staticmethod
def load_file_from_binary(
    binary_data: bytes,
    encoding: str = "utf-8",
    errors: str = "ignore",
    raise_on_error: bool = False,
) -> "List[Document]":
    """
    Load documents from an in-memory binary payload.

    The MIME type of the payload is detected with 'magic' and translated to
    a file extension; the payload is then written to a temporary file with
    that extension and handed to `SimpleDirectoryReader.load_file`. The
    temporary file is removed before returning, whether or not loading
    succeeded.

    Args:
        binary_data: Raw content of the file to load.
        encoding: Forwarded to `load_file` as `encoding`.
        errors: Forwarded to `load_file` as `errors`.
        raise_on_error: Forwarded to `load_file` as `raise_on_error`.

    Returns:
        The documents produced by `load_file` for the temporary file.

    Raises:
        ImportError: If the 'magic' module is not available.
    """
    default_mime_types_map = SimpleDirectoryReader.mime_types_fn()

    # Use magic to get the MIME type from the binary data; payloads of an
    # unknown type fall back to a generic '.bin' suffix.
    mime_type = magic.from_buffer(binary_data, mime=True)
    file_suffix = default_mime_types_map.get(mime_type, ".bin")

    # Stays None until the temporary file exists, so the cleanup below never
    # touches an unbound name when file creation itself fails.
    temp_filename = None
    try:
        # delete=False: the file must outlive the `with` block so that it is
        # closed (and fully flushed) before load_file reopens it by name.
        with tempfile.NamedTemporaryFile(
            suffix=file_suffix, delete=False
        ) as temp_file:
            temp_filename = temp_file.name
            temp_file.write(binary_data)

        # These options were previously accepted but silently ignored;
        # pass them through so callers' settings take effect.
        return SimpleDirectoryReader.load_file(
            Path(temp_filename),
            None,
            {},
            encoding=encoding,
            errors=errors,
            raise_on_error=raise_on_error,
        )
    finally:
        # Ensure the temporary file is deleted.
        if temp_filename is not None:
            try:
                os.remove(temp_filename)
            except FileNotFoundError:
                # Already gone -- nothing left to clean up.
                pass
50 changes: 48 additions & 2 deletions llama-index-core/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions llama-index-core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ deprecated = ">=1.2.9.3"
fsspec = ">=2023.5.0"
httpx = "*"
langchain = {optional = true, version = ">=0.0.303"}
python-magic = "^0.4.22"  # MIME type detection for binary input; requires the libmagic system library
nest-asyncio = "^1.5.8"
nltk = "^3.8.1"
numpy = "*"
Expand Down Expand Up @@ -85,6 +86,7 @@ pillow = ">=9.0.0"
PyYAML = ">=6.0.1"
llamaindex-py-client = "^0.1.18"
wrapt = "*"
reportlab = "*"

[tool.poetry.extras]
gradientai = [
Expand Down
50 changes: 49 additions & 1 deletion llama-index-core/tests/readers/test_load_reader.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import io

from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from typing import cast

from llama_index.core import SimpleDirectoryReader
from llama_index.core.readers.loading import load_reader
from llama_index.core.readers.string_iterable import StringIterableReader

import magic

def test_loading_readers() -> None:
string_iterable = StringIterableReader()
Expand All @@ -14,3 +19,46 @@ def test_loading_readers() -> None:
)

assert loaded_string_iterable.is_remote == string_iterable.is_remote

def test_load_binary_data_file() -> None:
    """Check that an in-memory PDF is loaded as a single document with its text."""
    # Render a one-line PDF entirely in memory.
    pdf_bytes = io.BytesIO()
    c = canvas.Canvas(pdf_bytes, pagesize=letter)
    c.drawString(100, 750, "Hello, this is a PDF file.")
    c.save()

    # getvalue() returns the whole buffer regardless of the stream position.
    pdf_data = pdf_bytes.getvalue()

    # Stub the MIME type identification to return 'application/pdf', and
    # always restore the real function so the stub cannot leak into other
    # tests that run in the same process.
    original_from_buffer = magic.from_buffer
    magic.from_buffer = lambda x, mime: "application/pdf"
    try:
        documents = SimpleDirectoryReader.load_file_from_binary(pdf_data)
    finally:
        magic.from_buffer = original_from_buffer

    # Check the count first so a wrong count fails with a clear assertion
    # instead of an IndexError on documents[0].
    assert len(documents) == 1
    assert documents[0].text == "Hello, this is a PDF file.\n"

def test_load_unsupported_binary_data_file_type() -> None:
    """Check that a payload of an unmapped MIME type still yields one document."""
    # Binary data for a non-text type that has no entry in the MIME map.
    binary_data = b"\x00\x01\x02\x03\x04"

    # Stub the MIME type identification to return an unsupported type, and
    # always restore the real function so the stub cannot leak into other
    # tests that run in the same process.
    original_from_buffer = magic.from_buffer
    magic.from_buffer = lambda x, mime: "application/octet-stream"
    try:
        documents = SimpleDirectoryReader.load_file_from_binary(binary_data)
    finally:
        magic.from_buffer = original_from_buffer

    # The content may be gibberish or empty, so only the shape is checked.
    # (The former `len(text) >= 0` check could never fail.)
    assert len(documents) == 1
    assert isinstance(documents[0].text, str)