Skip to content

Commit

Permalink
fix: drop detection_class_prob from element metadata (#243)
Browse files Browse the repository at this point in the history
The `detection_class_prob` field was added to the element metadata in
`unstructured==0.10.13`, and it was deployed to the hosted API [last
week](https://github.com/Unstructured-IO/unstructured-api/releases/tag/0.0.43).
When users called `partition_via_api` with older versions of the
library, they saw an error related to parsing the new schema. We should
fix backwards compatibility in the library, but in the meantime let's
drop the new field; it's not particularly relevant to the end user.

Closes #237
  • Loading branch information
awalker4 authored Sep 18, 2023
1 parent 1f01d06 commit 6923a24
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.0.45-dev0

* Drop `detection_class_prob` from the element metadata. The presence of this field broke backwards compatibility when users of older library versions called `partition_via_api`.

## 0.0.44

* Bump unstructured to 0.10.14
Expand Down
6 changes: 5 additions & 1 deletion prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ def pipeline_api(
raise e

# Clean up returned elements
# Note(austin): pydantic should control this sort of thing for us
for i, element in enumerate(elements):
elements[i].metadata.filename = os.path.basename(filename)

Expand All @@ -398,6 +399,9 @@ def pipeline_api(
if element.metadata.file_directory:
elements[i].metadata.file_directory = None

if element.metadata.detection_class_prob:
elements[i].metadata.detection_class_prob = None

if response_type == "text/csv":
df = convert_to_dataframe(elements)
return df.to_csv(index=False)
Expand Down Expand Up @@ -512,7 +516,7 @@ def return_content_type(filename):


@router.post("/general/v0/general")
@router.post("/general/v0.0.44/general")
@router.post("/general/v0.0.45/general")
def pipeline_1(
request: Request,
gz_uncompressed_content_type: Optional[str] = Form(default=None),
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.44
version: 0.0.45
8 changes: 7 additions & 1 deletion test_general/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,10 @@ def test_general_api(example_filename, content_type):
assert len(dfs) > 0


def test_coordinates_param():
def test_metadata_fields_removed():
"""
Verify that responses do not include coordinates unless requested
Verify that certain other metadata fields are dropped
"""
client = TestClient(app)
test_file = Path("sample-docs") / "layout-parser-paper-fast.jpg"
Expand All @@ -124,11 +125,16 @@ def test_coordinates_param():
response_with_coords = response.json()

# Each element should be the same except for the coordinates field
# Also, check for metadata fields we explicitly dropped
for i in range(len(response_with_coords)):
assert "coordinates" in response_with_coords[i]["metadata"]
del response_with_coords[i]["metadata"]["coordinates"]
assert response_with_coords[i] == response_without_coords[i]

assert "last_modified" not in response_without_coords[i]["metadata"]
assert "file_directory" not in response_without_coords[i]["metadata"]
assert "detection_class_prob" not in response_without_coords[i]["metadata"]


def test_ocr_languages_param():
"""
Expand Down

0 comments on commit 6923a24

Please sign in to comment.