Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] Introduce parser supplier support in FileSystemDocumentLoader #1031

Merged
merged 2 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -1,7 +1,7 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.metadata.Metadata;
Expand All @@ -12,6 +12,7 @@
import org.xml.sax.ContentHandler;

import java.io.InputStream;
import java.util.function.Supplier;

import static dev.langchain4j.internal.Utils.getOrDefault;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
Expand All @@ -25,19 +26,23 @@
public class ApacheTikaDocumentParser implements DocumentParser {

// Value passed to BodyContentHandler so that it is created without a write limit.
private static final int NO_WRITE_LIMIT = -1;
// Default suppliers for the Tika components; every get() call constructs a new instance.
public static final Supplier<Parser> DEFAULT_PARSER_SUPPLIER = AutoDetectParser::new;
public static final Supplier<Metadata> DEFAULT_METADATA_SUPPLIER = Metadata::new;
public static final Supplier<ParseContext> DEFAULT_PARSE_CONTEXT_SUPPLIER = ParseContext::new;
public static final Supplier<ContentHandler> DEFAULT_CONTENT_HANDLER_SUPPLIER = () -> new BodyContentHandler(NO_WRITE_LIMIT);

// Fixed Tika component instances, assigned once in the constructor.
private final Parser parser;
private final ContentHandler contentHandler;
private final Metadata metadata;
private final ParseContext parseContext;
// Suppliers queried on each parse(...) call to obtain the Tika components for that call.
private final Supplier<Parser> parserSupplier;
private final Supplier<ContentHandler> contentHandlerSupplier;
private final Supplier<Metadata> metadataSupplier;
private final Supplier<ParseContext> parseContextSupplier;

/**
* Creates an instance of an {@code ApacheTikaDocumentParser} with the default Tika components.
* It uses {@link AutoDetectParser}, {@link BodyContentHandler} without write limit,
* empty {@link Metadata} and empty {@link ParseContext}.
*/
public ApacheTikaDocumentParser() {
this(null, null, null, null);
// The cast on the first argument selects the Supplier-based constructor; four bare nulls
// would be ambiguous between the two four-argument constructors of this class.
this((Supplier<Parser>) null, null, null, null);
}

/**
Expand All @@ -48,22 +53,50 @@ public ApacheTikaDocumentParser() {
* @param contentHandler Tika content handler. Default: {@link BodyContentHandler} without write limit
* @param metadata Tika metadata. Default: empty {@link Metadata}
* @param parseContext Tika parse context. Default: empty {@link ParseContext}
* @deprecated Use the constructor with suppliers for Tika components if you intend to use this parser for multiple files.
*/
@Deprecated
public ApacheTikaDocumentParser(Parser parser,
ContentHandler contentHandler,
Metadata metadata,
ParseContext parseContext) {
this.parser = getOrDefault(parser, AutoDetectParser::new);
this.contentHandler = getOrDefault(contentHandler, () -> new BodyContentHandler(NO_WRITE_LIMIT));
this.metadata = getOrDefault(metadata, Metadata::new);
this.parseContext = getOrDefault(parseContext, ParseContext::new);
// Delegate to the supplier-based constructor: each lambda hands out the given instance when one
// was provided and otherwise falls back to the matching default supplier.
// NOTE(review): a provided instance is returned on every get(), so it is shared across all
// parse calls — presumably the reason this constructor is deprecated; confirm.
this(
() -> getOrDefault(parser, DEFAULT_PARSER_SUPPLIER),
() -> getOrDefault(contentHandler, DEFAULT_CONTENT_HANDLER_SUPPLIER),
() -> getOrDefault(metadata, DEFAULT_METADATA_SUPPLIER),
() -> getOrDefault(parseContext, DEFAULT_PARSE_CONTEXT_SUPPLIER)
);
}

/**
 * Builds an {@code ApacheTikaDocumentParser} that obtains its Tika components from the given suppliers.
 * Any supplier passed as {@code null} is replaced by the corresponding default supplier.
 *
 * @param parserSupplier         source of the Tika parser; {@code null} selects {@link AutoDetectParser}
 * @param contentHandlerSupplier source of the Tika content handler; {@code null} selects
 *                               {@link BodyContentHandler} without write limit
 * @param metadataSupplier       source of the Tika metadata; {@code null} selects an empty {@link Metadata}
 * @param parseContextSupplier   source of the Tika parse context; {@code null} selects an empty
 *                               {@link ParseContext}
 */
public ApacheTikaDocumentParser(Supplier<Parser> parserSupplier,
                                Supplier<ContentHandler> contentHandlerSupplier,
                                Supplier<Metadata> metadataSupplier,
                                Supplier<ParseContext> parseContextSupplier) {
    // Explicit null checks: keep the caller's supplier when present, else use the class default.
    this.parserSupplier = parserSupplier != null
            ? parserSupplier
            : DEFAULT_PARSER_SUPPLIER;
    this.contentHandlerSupplier = contentHandlerSupplier != null
            ? contentHandlerSupplier
            : DEFAULT_CONTENT_HANDLER_SUPPLIER;
    this.metadataSupplier = metadataSupplier != null
            ? metadataSupplier
            : DEFAULT_METADATA_SUPPLIER;
    this.parseContextSupplier = parseContextSupplier != null
            ? parseContextSupplier
            : DEFAULT_PARSE_CONTEXT_SUPPLIER;
}

// TODO allow automatic extraction of metadata (e.g. creator, last-author, created/modified timestamp, etc.)

@Override
public Document parse(InputStream inputStream) {
try {
Parser parser = parserSupplier.get();
ContentHandler contentHandler = contentHandlerSupplier.get();
Metadata metadata = metadataSupplier.get();
ParseContext parseContext = parseContextSupplier.get();

parser.parse(inputStream, contentHandler, metadata, parseContext);
String text = contentHandler.toString();

Expand Down
@@ -1,9 +1,10 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.parser.AutoDetectParser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

Expand Down Expand Up @@ -40,7 +41,7 @@ void should_parse_doc_ppt_and_pdf_files(String fileName) {
})
void should_parse_xls_files(String fileName) {

DocumentParser parser = new ApacheTikaDocumentParser(new AutoDetectParser(), null, null, null);
DocumentParser parser = new ApacheTikaDocumentParser(AutoDetectParser::new, null, null, null);
langchain4j marked this conversation as resolved.
Show resolved Hide resolved
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);

Document document = parser.parse(inputStream);
Expand All @@ -50,6 +51,24 @@ void should_parse_xls_files(String fileName) {
assertThat(document.metadata().asMap()).isEmpty();
}

@Test
void should_parse_files_stateless() {

    // One parser instance is reused for two consecutive parse calls on the same resource.
    DocumentParser parser = new ApacheTikaDocumentParser();
    ClassLoader classLoader = getClass().getClassLoader();
    String expectedText = "Sheet1\ntest content\nSheet2\ntest content";

    InputStream firstStream = classLoader.getResourceAsStream("test-file.xls");
    InputStream secondStream = classLoader.getResourceAsStream("test-file.xls");

    Document firstDocument = parser.parse(firstStream);
    Document secondDocument = parser.parse(secondStream);

    // Both results must carry the same text and no metadata.
    assertThat(firstDocument.text()).isEqualToIgnoringWhitespace(expectedText);
    assertThat(secondDocument.text()).isEqualToIgnoringWhitespace(expectedText);
    assertThat(firstDocument.metadata().asMap()).isEmpty();
    assertThat(secondDocument.metadata().asMap()).isEmpty();
}

@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
Expand Down