Skip to content

Commit

Permalink
refactor: partition_pdf() pass kwargs through fast strategy pip…
Browse files Browse the repository at this point in the history
…eline (#3040)

This PR passes `kwargs` through the `fast` strategy pipeline, which
was missed in the previous PR,
#3030.
I also did some code refactoring in this PR, so I recommend reviewing
it commit by commit.

### Summary
- pass `kwargs` through the `fast` strategy pipeline, which allows users
to specify additional params like `sort_mode`
- refactor: code reorganization
- cut a release for `0.14.0`

### Testing
CI should pass
  • Loading branch information
christinestraub committed May 17, 2024
1 parent 9cd0e70 commit 76831f1
Show file tree
Hide file tree
Showing 6 changed files with 320 additions and 313 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.0-dev15
## 0.14.0

### BREAKING CHANGES

Expand Down
20 changes: 10 additions & 10 deletions test_unstructured/partition/pdf_image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def test_partition_image_metadata_date(
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pdf.get_last_modified_date",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = image.partition_image(filename=filename)
Expand All @@ -340,7 +340,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date(
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pdf.get_last_modified_date",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES)
Expand All @@ -356,7 +356,7 @@ def test_partition_image_metadata_date_custom_metadata_date(
expected_last_modification_date = "2009-07-05T09:24:28"

mocker.patch(
"unstructured.partition.pdf.get_last_modified_date",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = image.partition_image(
Expand All @@ -375,7 +375,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date
expected_last_modification_date = "2009-07-05T09:24:28"

mocker.patch(
"unstructured.partition.pdf.get_last_modified_date",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = image.partition_image(
Expand All @@ -393,7 +393,7 @@ def test_partition_image_from_file_metadata_date(
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pdf.get_last_modified_date_from_file",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
Expand All @@ -408,7 +408,7 @@ def test_partition_image_from_file_explicit_get_metadata_date(
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pdf.get_last_modified_date_from_file",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
Expand All @@ -423,7 +423,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pdf.get_last_modified_date_from_file",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)

Expand All @@ -439,7 +439,7 @@ def test_partition_image_from_file_with_hi_res_strategy_explicit_get_metadata_da
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pdf.get_last_modified_date_from_file",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)

Expand All @@ -459,7 +459,7 @@ def test_partition_image_from_file_metadata_date_custom_metadata_date(
expected_last_modification_date = "2009-07-05T09:24:28"

mocker.patch(
"unstructured.partition.pdf.get_last_modified_date_from_file",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
Expand All @@ -479,7 +479,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
expected_last_modification_date = "2009-07-05T09:24:28"

mocker.patch(
"unstructured.partition.pdf.get_last_modified_date_from_file",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,11 +715,11 @@ def test_partition_pdf_metadata_date(
expected_last_modification_date = None

mocker.patch(
"unstructured.partition.pdf.get_last_modified_date_from_file",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
mocker.patch(
"unstructured.partition.pdf.get_last_modified_date",
"unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
return_value=mocked_last_modification_date,
)

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.0-dev15" # pragma: no cover
__version__ = "0.14.0" # pragma: no cover

0 comments on commit 76831f1

Please sign in to comment.