Skip to content

Commit

Permalink
release: 0.0.82 (#481)
Browse files Browse the repository at this point in the history
Per CHANGELOG update:

## 0.0.82

* Bump to `unstructured` 0.16.11
* No longer attempts to download the NLTK asset from S3, which could result
in a 403 error
  • Loading branch information
cragwolfe authored Dec 13, 2024
1 parent d9afddf commit 9b45894
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 309 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.0.82

* Bump to `unstructured` 0.16.11
* No longer attempts to download NLTK asset from S3 which could result in a 403

## 0.0.81

* Update `strategy` parameter to allow `'` and `"` as input surrounding the value.
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
app = FastAPI(
title="Unstructured Pipeline API",
summary="Partition documents with the Unstructured library",
version="0.0.81",
version="0.0.82",
docs_url="/general/docs",
openapi_url="/general/openapi.json",
servers=[
Expand Down
55 changes: 2 additions & 53 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
elements_from_json,
)
from unstructured_inference.models.base import UnknownModelException
from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES

app = FastAPI()
router = APIRouter()
Expand Down Expand Up @@ -214,37 +213,6 @@ def partition_pdf_splits(
return results


# Process-wide flag: True while a Chipper call guarded by
# ChipperMemoryProtection is in flight.
is_chipper_processing = False


class ChipperMemoryProtection:
    """Context manager that admits only one Chipper call at a time.

    Chipper calls are expensive, and right now we can only do one call at a time.
    If the model is already in use, entering the context raises a 503 error instead
    of queueing. The API should scale up and the user can try again on a different
    server.
    """

    def __enter__(self) -> None:
        """Mark Chipper as busy for the duration of the ``with`` body.

        Raises:
            HTTPException: with status 503 when the busy flag is already set.
        """
        global is_chipper_processing
        if is_chipper_processing:
            # Log here so we can track how often it happens
            logger.error("Chipper is already in use")
            raise HTTPException(
                status_code=503, detail="Server is under heavy load. Please try again later."
            )

        # NOTE(review): this check-then-set is not atomic. If requests can run on
        # several threads at once, two callers could both pass the check above;
        # consider a non-blocking threading.Lock -- confirm against the server setup.
        is_chipper_processing = True

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Clear the busy flag.

        Returns None, so an exception raised inside the ``with`` body is not
        suppressed and propagates to the caller after the flag is cleared.
        """
        global is_chipper_processing
        is_chipper_processing = False


def pipeline_api(
file: IO[bytes],
request: Request,
Expand Down Expand Up @@ -331,7 +299,6 @@ def pipeline_api(
if file_content_type == "application/pdf":
_check_pdf(file)

hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
strategy = _validate_strategy(strategy)
pdf_infer_table_structure = _set_pdf_infer_table_structure(
pdf_infer_table_structure,
Expand Down Expand Up @@ -417,9 +384,6 @@ def pipeline_api(
coordinates=coordinates,
**partition_kwargs, # type: ignore # pyright: ignore[reportGeneralTypeIssues]
)
elif hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES:
with ChipperMemoryProtection():
elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues]
else:
elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues]

Expand Down Expand Up @@ -533,21 +497,6 @@ def _validate_strategy(strategy: str) -> str:
return strategy


def _validate_hi_res_model_name(
    hi_res_model_name: Optional[str], show_coordinates: bool
) -> Optional[str]:
    """Resolve the ``chipper`` alias and reject coordinates for Chipper models.

    Args:
        hi_res_model_name: Requested model name; ``None`` or an empty string is
            returned unchanged.
        show_coordinates: Whether the caller asked for element coordinates.

    Returns:
        The model name, with ``"chipper"`` replaced by ``"chipperv2"``.

    Raises:
        HTTPException: with status 400 when the resolved name is in
            ``CHIPPER_MODEL_TYPES`` and ``show_coordinates`` is truthy.
    """
    # Nothing to validate when no model name was supplied.
    if not hi_res_model_name:
        return hi_res_model_name

    # Make sure chipper aliases to the latest model
    if hi_res_model_name == "chipper":
        hi_res_model_name = "chipperv2"

    is_chipper_model = hi_res_model_name in CHIPPER_MODEL_TYPES
    if is_chipper_model and show_coordinates:
        raise HTTPException(
            status_code=400,
            detail=f"coordinates aren't available when using the {hi_res_model_name} model type",
        )

    return hi_res_model_name


def _validate_chunking_strategy(chunking_strategy: Optional[str]) -> Optional[str]:
"""Raise on `chunking_strategy` is not a valid chunking strategy name.
Expand Down Expand Up @@ -653,7 +602,7 @@ def return_content_type(filename: str):


@router.get("/general/v0/general", include_in_schema=False)
@router.get("/general/v0.0.81/general", include_in_schema=False)
@router.get("/general/v0.0.82/general", include_in_schema=False)
async def handle_invalid_get_request():
raise HTTPException(
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
Expand All @@ -668,7 +617,7 @@ async def handle_invalid_get_request():
description="Description",
operation_id="partition_parameters",
)
@router.post("/general/v0.0.81/general", include_in_schema=False)
@router.post("/general/v0.0.82/general", include_in_schema=False)
def general_partition(
request: Request,
# cannot use annotated type here because of a bug described here:
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.81
version: 0.0.82
Loading

0 comments on commit 9b45894

Please sign in to comment.