From 1f23beb85177422e02a42c0c94f9429d77e4e794 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 12 Feb 2025 14:58:07 -0500 Subject: [PATCH 1/5] set constraint on transformers --- requirements/common/constraints.txt | 3 + requirements/embed/huggingface.txt | 82 +++++---- requirements/local_partition/image.txt | 244 ++++++++----------------- requirements/local_partition/pdf.txt | 244 ++++++++----------------- 4 files changed, 191 insertions(+), 382 deletions(-) diff --git a/requirements/common/constraints.txt b/requirements/common/constraints.txt index b07b59f56..bcad8898a 100644 --- a/requirements/common/constraints.txt +++ b/requirements/common/constraints.txt @@ -32,3 +32,6 @@ deltalake<=0.22.0 # TODO: investigate breaking changed introdced in lancedb>0.15.0 lancedb<=0.15.0 + +# Vulnerability found in versions < 4.48.0 +transformers>=4.48.0 diff --git a/requirements/embed/huggingface.txt b/requirements/embed/huggingface.txt index 8b7ecf9f4..594267eba 100644 --- a/requirements/embed/huggingface.txt +++ b/requirements/embed/huggingface.txt @@ -1,90 +1,92 @@ # This file was autogenerated by uv via the following command: # uv pip compile ./embed/huggingface.in --output-file ./embed/huggingface.txt --no-strip-extras --python-version 3.9 +boto3==1.34.131 + # via pytorch-transformers +botocore==1.34.131 + # via + # -c ./embed/../common/constraints.txt + # boto3 + # s3transfer certifi==2025.1.31 # via requests charset-normalizer==3.4.1 # via requests +click==8.1.8 + # via nltk filelock==3.17.0 - # via - # huggingface-hub - # torch - # transformers + # via torch fsspec==2024.5.0 # via # -c ./embed/../common/constraints.txt - # huggingface-hub # torch -huggingface-hub==0.28.1 - # via - # sentence-transformers - # tokenizers - # transformers idna==3.10 # via requests jinja2==3.1.5 # via torch +jmespath==1.0.1 + # via + # boto3 + # botocore joblib==1.4.2 - # via scikit-learn + # via + # nltk + # scikit-learn markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 # via sympy 
networkx==3.2.1 # via torch +nltk==3.9.1 + # via sentence-transformers numpy==1.26.4 # via # -c ./embed/../common/constraints.txt + # pytorch-transformers # scikit-learn # scipy - # transformers -packaging==24.2 - # via - # huggingface-hub - # transformers -pillow==11.1.0 + # sentence-transformers +python-dateutil==2.9.0.post0 + # via botocore +pytorch-transformers==1.1.0 # via sentence-transformers -pyyaml==6.0.2 - # via - # huggingface-hub - # transformers regex==2024.11.6 - # via transformers -requests==2.32.3 # via - # huggingface-hub - # transformers -safetensors==0.5.2 - # via transformers + # nltk + # pytorch-transformers +requests==2.32.3 + # via pytorch-transformers +s3transfer==0.10.4 + # via boto3 scikit-learn==1.6.1 # via sentence-transformers scipy==1.13.1 # via # scikit-learn # sentence-transformers -sentence-transformers==3.4.1 +sentence-transformers==0.2.3 # via -r ./embed/huggingface.in +sentencepiece==0.2.0 + # via pytorch-transformers +six==1.17.0 + # via python-dateutil sympy==1.13.1 # via torch threadpoolctl==3.5.0 # via scikit-learn -tokenizers==0.19.1 - # via - # -c ./embed/../common/constraints.txt - # transformers torch==2.6.0 - # via sentence-transformers + # via + # pytorch-transformers + # sentence-transformers tqdm==4.67.1 # via - # huggingface-hub + # nltk + # pytorch-transformers # sentence-transformers - # transformers -transformers==4.44.2 - # via sentence-transformers typing-extensions==4.12.2 - # via - # huggingface-hub - # torch + # via torch urllib3==1.26.20 # via # -c ./embed/../common/constraints.txt + # botocore # requests diff --git a/requirements/local_partition/image.txt b/requirements/local_partition/image.txt index bf3bfce02..f97a90b64 100644 --- a/requirements/local_partition/image.txt +++ b/requirements/local_partition/image.txt @@ -1,28 +1,21 @@ # This file was autogenerated by uv via the following command: # uv pip compile ./local_partition/image.in --output-file ./local_partition/image.txt --no-strip-extras 
--python-version 3.9 -aiofiles==24.1.0 - # via unstructured-client annotated-types==0.7.0 # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf anyio==4.8.0 - # via httpx -backoff==2.2.1 - # via unstructured + # via starlette beautifulsoup4==4.13.3 # via unstructured -cachetools==5.5.1 - # via google-auth certifi==2025.1.31 - # via - # httpcore - # httpx - # requests + # via requests cffi==1.17.1 # via cryptography chardet==5.2.0 - # via unstructured + # via + # pdfplumber + # unstructured charset-normalizer==3.4.1 # via # pdfminer-six @@ -30,34 +23,29 @@ charset-normalizer==3.4.1 click==8.1.8 # via # nltk - # python-oxmsg + # uvicorn coloredlogs==15.0.1 # via onnxruntime contourpy==1.3.0 # via matplotlib cryptography==44.0.1 - # via - # pdfminer-six - # unstructured-client + # via pdfminer-six cycler==0.12.1 # via matplotlib dataclasses-json==0.6.7 # via unstructured -deprecated==1.2.18 - # via pikepdf effdet==0.4.1 - # via unstructured + # via layoutparser emoji==2.14.1 # via unstructured -eval-type-backport==0.2.2 - # via unstructured-client exceptiongroup==1.2.2 # via anyio +fastapi==0.115.8 + # via unstructured-inference filelock==3.17.0 # via # huggingface-hub # torch - # transformers filetype==1.2.0 # via unstructured flatbuffers==25.2.10 @@ -69,76 +57,44 @@ fsspec==2024.5.0 # -c ./local_partition/../common/constraints.txt # huggingface-hub # torch -google-api-core[grpc]==2.24.1 - # via google-cloud-vision -google-auth==2.38.0 - # via - # google-api-core - # google-cloud-vision -google-cloud-vision==3.9.0 - # via unstructured -googleapis-common-protos==1.66.0 - # via - # google-api-core - # grpcio-status -grpcio==1.70.0 - # via - # -c ./local_partition/../common/constraints.txt - # google-api-core - # grpcio-status -grpcio-status==1.62.3 - # via google-api-core h11==0.14.0 - # via httpcore -html5lib==1.1 - # via unstructured -httpcore==1.0.7 - # via httpx -httpx==0.28.1 - # via unstructured-client + # via uvicorn huggingface-hub==0.28.1 # via # timm - 
# tokenizers - # transformers # unstructured-inference humanfriendly==10.0 # via coloredlogs idna==3.10 # via # anyio - # httpx # requests importlib-resources==6.5.2 # via matplotlib +iopath==0.1.10 + # via layoutparser jinja2==3.1.5 # via torch joblib==1.4.2 # via nltk -jsonpath-python==1.0.6 - # via unstructured-client +jsons==1.6.3 + # via unstructured-inference kiwisolver==1.4.7 # via matplotlib -langdetect==1.0.9 - # via unstructured +layoutparser[layoutmodels, tesseract]==0.3.4 + # via unstructured-inference lxml==5.3.1 - # via - # pikepdf - # unstructured + # via unstructured markupsafe==3.0.2 # via jinja2 marshmallow==3.26.1 # via dataclasses-json matplotlib==3.9.4 - # via - # pycocotools - # unstructured-inference + # via pycocotools mpmath==1.3.0 # via sympy mypy-extensions==1.0.0 # via typing-inspect -nest-asyncio==1.6.0 - # via unstructured-client networkx==3.2.1 # via torch nltk==3.9.1 @@ -147,206 +103,155 @@ numpy==1.26.4 # via # -c ./local_partition/../common/constraints.txt # contourpy + # layoutparser # matplotlib - # onnx # onnxruntime # opencv-python # pandas # pycocotools # scipy # torchvision - # transformers - # unstructured - # unstructured-inference -olefile==0.47 - # via python-oxmsg omegaconf==2.3.0 # via effdet -onnx==1.17.0 - # via - # unstructured - # unstructured-inference onnxruntime==1.19.2 # via unstructured-inference -opencv-python==4.11.0.86 - # via unstructured-inference +opencv-python==4.6.0.66 + # via + # layoutparser + # unstructured-inference packaging==24.2 # via # huggingface-hub # marshmallow # matplotlib # onnxruntime - # pikepdf - # transformers + # pytesseract # unstructured-pytesseract pandas==2.2.3 - # via unstructured-inference + # via layoutparser pdf2image==1.17.0 - # via unstructured + # via + # layoutparser + # unstructured pdfminer-six==20240706 # via + # pdfplumber # unstructured - # unstructured-inference -pi-heif==0.21.0 - # via unstructured -pikepdf==9.5.2 - # via unstructured +pdfplumber==0.5.3 + # via 
layoutparser pillow==11.1.0 # via + # layoutparser # matplotlib # pdf2image - # pi-heif - # pikepdf + # pdfplumber + # pytesseract # torchvision # unstructured-pytesseract -proto-plus==1.26.0 - # via - # google-api-core - # google-cloud-vision +portalocker==3.1.1 + # via iopath protobuf==4.23.4 # via # -c ./local_partition/../common/constraints.txt - # google-api-core - # google-cloud-vision - # googleapis-common-protos - # grpcio-status - # onnx # onnxruntime - # proto-plus -psutil==6.1.1 - # via unstructured -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth pycocotools==2.0.8 # via effdet pycparser==2.22 # via cffi +pycrypto==2.6.1 + # via pdfplumber pydantic==2.10.6 - # via unstructured-client + # via fastapi pydantic-core==2.27.2 # via pydantic pyparsing==3.2.1 # via matplotlib -pypdf==5.3.0 - # via - # unstructured - # unstructured-client -pypdfium2==4.30.1 - # via unstructured-inference +pytesseract==0.3.13 + # via layoutparser python-dateutil==2.9.0.post0 # via # matplotlib # pandas - # unstructured-client python-iso639==2025.2.8 # via unstructured python-magic==0.4.27 # via unstructured python-multipart==0.0.20 # via unstructured-inference -python-oxmsg==0.0.2 - # via unstructured pytz==2025.1 # via pandas pyyaml==6.0.2 # via # huggingface-hub + # layoutparser # omegaconf # timm - # transformers -rapidfuzz==3.12.1 - # via - # unstructured - # unstructured-inference regex==2024.11.6 - # via - # nltk - # transformers + # via nltk requests==2.32.3 # via - # google-api-core # huggingface-hub - # requests-toolbelt - # transformers # unstructured -requests-toolbelt==1.0.0 - # via unstructured-client -rsa==4.9 - # via google-auth safetensors==0.5.2 - # via - # timm - # transformers + # via timm scipy==1.13.1 - # via unstructured-inference + # via layoutparser six==1.17.0 - # via - # html5lib - # langdetect - # python-dateutil + # via python-dateutil sniffio==1.3.1 # via anyio soupsieve==2.6 # via beautifulsoup4 
-sympy==1.13.1 +starlette==0.45.3 + # via fastapi +sympy==1.13.3 # via # onnxruntime # torch +tabulate==0.9.0 + # via unstructured timm==1.0.14 + # via effdet +torch==2.4.1 # via # effdet - # unstructured-inference -tokenizers==0.19.1 - # via - # -c ./local_partition/../common/constraints.txt - # transformers -torch==2.6.0 - # via - # effdet + # layoutparser # timm # torchvision - # unstructured-inference -torchvision==0.21.0 +torchvision==0.19.1 # via # effdet + # layoutparser # timm tqdm==4.67.1 # via # huggingface-hub + # iopath # nltk - # transformers - # unstructured -transformers==4.44.2 - # via unstructured-inference typing-extensions==4.12.2 # via # anyio # beautifulsoup4 + # fastapi # huggingface-hub + # iopath # pydantic # pydantic-core - # pypdf - # python-oxmsg + # starlette # torch # typing-inspect - # unstructured + # uvicorn typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client + # via dataclasses-json +typish==1.9.3 + # via jsons tzdata==2025.1 # via pandas -unstructured[image]==0.16.20 +unicodecsv==0.14.1 + # via pdfplumber +unstructured[image]==0.10.16 # via -r ./local_partition/image.in -unstructured-client==0.29.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -unstructured-inference==0.8.7 +unstructured-inference==0.2.5 # via unstructured unstructured-pytesseract==0.3.13 # via unstructured @@ -354,12 +259,9 @@ urllib3==1.26.20 # via # -c ./local_partition/../common/constraints.txt # requests -webencodings==0.5.1 - # via html5lib -wrapt==1.17.2 - # via - # -c ./local_partition/../common/constraints.txt - # deprecated - # unstructured +uvicorn==0.34.0 + # via unstructured-inference +wand==0.6.13 + # via pdfplumber zipp==3.21.0 # via importlib-resources diff --git a/requirements/local_partition/pdf.txt b/requirements/local_partition/pdf.txt index 3b21d01b5..9c7643205 100644 --- a/requirements/local_partition/pdf.txt +++ b/requirements/local_partition/pdf.txt @@ -1,28 +1,21 @@ # This file was 
autogenerated by uv via the following command: # uv pip compile ./local_partition/pdf.in --output-file ./local_partition/pdf.txt --no-strip-extras --python-version 3.9 -aiofiles==24.1.0 - # via unstructured-client annotated-types==0.7.0 # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf anyio==4.8.0 - # via httpx -backoff==2.2.1 - # via unstructured + # via starlette beautifulsoup4==4.13.3 # via unstructured -cachetools==5.5.1 - # via google-auth certifi==2025.1.31 - # via - # httpcore - # httpx - # requests + # via requests cffi==1.17.1 # via cryptography chardet==5.2.0 - # via unstructured + # via + # pdfplumber + # unstructured charset-normalizer==3.4.1 # via # pdfminer-six @@ -30,34 +23,29 @@ charset-normalizer==3.4.1 click==8.1.8 # via # nltk - # python-oxmsg + # uvicorn coloredlogs==15.0.1 # via onnxruntime contourpy==1.3.0 # via matplotlib cryptography==44.0.1 - # via - # pdfminer-six - # unstructured-client + # via pdfminer-six cycler==0.12.1 # via matplotlib dataclasses-json==0.6.7 # via unstructured -deprecated==1.2.18 - # via pikepdf effdet==0.4.1 - # via unstructured + # via layoutparser emoji==2.14.1 # via unstructured -eval-type-backport==0.2.2 - # via unstructured-client exceptiongroup==1.2.2 # via anyio +fastapi==0.115.8 + # via unstructured-inference filelock==3.17.0 # via # huggingface-hub # torch - # transformers filetype==1.2.0 # via unstructured flatbuffers==25.2.10 @@ -69,76 +57,44 @@ fsspec==2024.5.0 # -c ./local_partition/../common/constraints.txt # huggingface-hub # torch -google-api-core[grpc]==2.24.1 - # via google-cloud-vision -google-auth==2.38.0 - # via - # google-api-core - # google-cloud-vision -google-cloud-vision==3.9.0 - # via unstructured -googleapis-common-protos==1.66.0 - # via - # google-api-core - # grpcio-status -grpcio==1.70.0 - # via - # -c ./local_partition/../common/constraints.txt - # google-api-core - # grpcio-status -grpcio-status==1.62.3 - # via google-api-core h11==0.14.0 - # via httpcore -html5lib==1.1 - 
# via unstructured -httpcore==1.0.7 - # via httpx -httpx==0.28.1 - # via unstructured-client + # via uvicorn huggingface-hub==0.28.1 # via # timm - # tokenizers - # transformers # unstructured-inference humanfriendly==10.0 # via coloredlogs idna==3.10 # via # anyio - # httpx # requests importlib-resources==6.5.2 # via matplotlib +iopath==0.1.10 + # via layoutparser jinja2==3.1.5 # via torch joblib==1.4.2 # via nltk -jsonpath-python==1.0.6 - # via unstructured-client +jsons==1.6.3 + # via unstructured-inference kiwisolver==1.4.7 # via matplotlib -langdetect==1.0.9 - # via unstructured +layoutparser[layoutmodels, tesseract]==0.3.4 + # via unstructured-inference lxml==5.3.1 - # via - # pikepdf - # unstructured + # via unstructured markupsafe==3.0.2 # via jinja2 marshmallow==3.26.1 # via dataclasses-json matplotlib==3.9.4 - # via - # pycocotools - # unstructured-inference + # via pycocotools mpmath==1.3.0 # via sympy mypy-extensions==1.0.0 # via typing-inspect -nest-asyncio==1.6.0 - # via unstructured-client networkx==3.2.1 # via torch nltk==3.9.1 @@ -147,206 +103,155 @@ numpy==1.26.4 # via # -c ./local_partition/../common/constraints.txt # contourpy + # layoutparser # matplotlib - # onnx # onnxruntime # opencv-python # pandas # pycocotools # scipy # torchvision - # transformers - # unstructured - # unstructured-inference -olefile==0.47 - # via python-oxmsg omegaconf==2.3.0 # via effdet -onnx==1.17.0 - # via - # unstructured - # unstructured-inference onnxruntime==1.19.2 # via unstructured-inference -opencv-python==4.11.0.86 - # via unstructured-inference +opencv-python==4.6.0.66 + # via + # layoutparser + # unstructured-inference packaging==24.2 # via # huggingface-hub # marshmallow # matplotlib # onnxruntime - # pikepdf - # transformers + # pytesseract # unstructured-pytesseract pandas==2.2.3 - # via unstructured-inference + # via layoutparser pdf2image==1.17.0 - # via unstructured + # via + # layoutparser + # unstructured pdfminer-six==20240706 # via + # pdfplumber 
# unstructured - # unstructured-inference -pi-heif==0.21.0 - # via unstructured -pikepdf==9.5.2 - # via unstructured +pdfplumber==0.5.3 + # via layoutparser pillow==11.1.0 # via + # layoutparser # matplotlib # pdf2image - # pi-heif - # pikepdf + # pdfplumber + # pytesseract # torchvision # unstructured-pytesseract -proto-plus==1.26.0 - # via - # google-api-core - # google-cloud-vision +portalocker==3.1.1 + # via iopath protobuf==4.23.4 # via # -c ./local_partition/../common/constraints.txt - # google-api-core - # google-cloud-vision - # googleapis-common-protos - # grpcio-status - # onnx # onnxruntime - # proto-plus -psutil==6.1.1 - # via unstructured -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth pycocotools==2.0.8 # via effdet pycparser==2.22 # via cffi +pycrypto==2.6.1 + # via pdfplumber pydantic==2.10.6 - # via unstructured-client + # via fastapi pydantic-core==2.27.2 # via pydantic pyparsing==3.2.1 # via matplotlib -pypdf==5.3.0 - # via - # unstructured - # unstructured-client -pypdfium2==4.30.1 - # via unstructured-inference +pytesseract==0.3.13 + # via layoutparser python-dateutil==2.9.0.post0 # via # matplotlib # pandas - # unstructured-client python-iso639==2025.2.8 # via unstructured python-magic==0.4.27 # via unstructured python-multipart==0.0.20 # via unstructured-inference -python-oxmsg==0.0.2 - # via unstructured pytz==2025.1 # via pandas pyyaml==6.0.2 # via # huggingface-hub + # layoutparser # omegaconf # timm - # transformers -rapidfuzz==3.12.1 - # via - # unstructured - # unstructured-inference regex==2024.11.6 - # via - # nltk - # transformers + # via nltk requests==2.32.3 # via - # google-api-core # huggingface-hub - # requests-toolbelt - # transformers # unstructured -requests-toolbelt==1.0.0 - # via unstructured-client -rsa==4.9 - # via google-auth safetensors==0.5.2 - # via - # timm - # transformers + # via timm scipy==1.13.1 - # via unstructured-inference + # via layoutparser six==1.17.0 - # via - 
# html5lib - # langdetect - # python-dateutil + # via python-dateutil sniffio==1.3.1 # via anyio soupsieve==2.6 # via beautifulsoup4 -sympy==1.13.1 +starlette==0.45.3 + # via fastapi +sympy==1.13.3 # via # onnxruntime # torch +tabulate==0.9.0 + # via unstructured timm==1.0.14 + # via effdet +torch==2.4.1 # via # effdet - # unstructured-inference -tokenizers==0.19.1 - # via - # -c ./local_partition/../common/constraints.txt - # transformers -torch==2.6.0 - # via - # effdet + # layoutparser # timm # torchvision - # unstructured-inference -torchvision==0.21.0 +torchvision==0.19.1 # via # effdet + # layoutparser # timm tqdm==4.67.1 # via # huggingface-hub + # iopath # nltk - # transformers - # unstructured -transformers==4.44.2 - # via unstructured-inference typing-extensions==4.12.2 # via # anyio # beautifulsoup4 + # fastapi # huggingface-hub + # iopath # pydantic # pydantic-core - # pypdf - # python-oxmsg + # starlette # torch # typing-inspect - # unstructured + # uvicorn typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client + # via dataclasses-json +typish==1.9.3 + # via jsons tzdata==2025.1 # via pandas -unstructured[pdf]==0.16.20 +unicodecsv==0.14.1 + # via pdfplumber +unstructured[pdf]==0.10.16 # via -r ./local_partition/pdf.in -unstructured-client==0.29.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -unstructured-inference==0.8.7 +unstructured-inference==0.2.5 # via unstructured unstructured-pytesseract==0.3.13 # via unstructured @@ -354,12 +259,9 @@ urllib3==1.26.20 # via # -c ./local_partition/../common/constraints.txt # requests -webencodings==0.5.1 - # via html5lib -wrapt==1.17.2 - # via - # -c ./local_partition/../common/constraints.txt - # deprecated - # unstructured +uvicorn==0.34.0 + # via unstructured-inference +wand==0.6.13 + # via pdfplumber zipp==3.21.0 # via importlib-resources From 042f9abbe4a9a6fd28932980b9bb9e230dc71b14 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 12 Feb 2025 14:58:30 
-0500 Subject: [PATCH 2/5] bump version --- CHANGELOG.md | 2 +- unstructured_ingest/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5efada32b..47c9ac793 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.3-dev0 +## 0.5.3-dev1 ### Enhancements diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 83e11ce2f..bb2ef509d 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.5.3-dev0" # pragma: no cover +__version__ = "0.5.3-dev1" # pragma: no cover From 488b69234ff2bfd436b00197b263402984711ad0 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 12 Feb 2025 15:00:45 -0500 Subject: [PATCH 3/5] set constraint on ibis framework --- requirements/common/constraints.txt | 3 ++ requirements/connectors/vastdb.txt | 75 +---------------------------- 2 files changed, 4 insertions(+), 74 deletions(-) diff --git a/requirements/common/constraints.txt b/requirements/common/constraints.txt index bcad8898a..17c9a0662 100644 --- a/requirements/common/constraints.txt +++ b/requirements/common/constraints.txt @@ -35,3 +35,6 @@ lancedb<=0.15.0 # Vulnerability found in versions < 4.48.0 transformers>=4.48.0 + +# Vulnerability found in versions < 7.1.0 +ibis-framework>=7.1.0 diff --git a/requirements/connectors/vastdb.txt b/requirements/connectors/vastdb.txt index 78152f9c7..a0a9e8418 100644 --- a/requirements/connectors/vastdb.txt +++ b/requirements/connectors/vastdb.txt @@ -1,18 +1,7 @@ # This file was autogenerated by uv via the following command: # uv pip compile ./connectors/vastdb.in --output-file ./connectors/vastdb.txt --no-strip-extras --python-version 3.9 -atpublic==3.1.2 - # via ibis-framework aws-requests-auth==0.4.3 # via vastdb -bidict==0.23.1 - # via ibis-framework -boto3==1.34.131 - # via vastdb -botocore==1.34.131 - # via - # -c ./connectors/../common/constraints.txt - # boto3 - # 
s3transfer certifi==2025.1.31 # via requests charset-normalizer==3.4.1 @@ -21,83 +10,21 @@ flatbuffers==25.2.10 # via vastdb ibis==3.3.0 # via -r ./connectors/vastdb.in -ibis-framework==5.1.0 - # via vastdb idna==3.10 # via requests -jmespath==1.0.1 - # via - # boto3 - # botocore -markdown-it-py==3.0.0 - # via rich -mdurl==0.1.2 - # via markdown-it-py -multipledispatch==0.6.0 - # via ibis-framework -numpy==1.26.4 - # via - # -c ./connectors/../common/constraints.txt - # ibis-framework - # pandas -packaging==24.2 - # via pooch -pandas==2.2.3 - # via ibis-framework -parsy==2.1 - # via ibis-framework -platformdirs==4.3.6 - # via pooch -pooch[progress, xxhash]==1.8.2 - # via ibis-framework pyarrow==19.0.0 # via # -r ./connectors/vastdb.in # vastdb -pygments==2.19.1 - # via rich -python-dateutil==2.9.0.post0 - # via - # botocore - # ibis-framework - # pandas -pytz==2025.1 - # via - # ibis-framework - # pandas requests==2.32.3 # via # aws-requests-auth - # pooch # vastdb -rich==13.9.4 - # via ibis-framework -s3transfer==0.10.4 - # via boto3 -six==1.17.0 - # via - # multipledispatch - # python-dateutil -sqlglot==11.7.1 - # via ibis-framework -toolz==0.12.1 - # via ibis-framework -tqdm==4.67.1 - # via pooch -typing-extensions==4.12.2 - # via - # ibis-framework - # rich -tzdata==2025.1 - # via pandas urllib3==1.26.20 # via # -c ./connectors/../common/constraints.txt - # botocore # requests -vastdb==0.1.2 +vastdb==0.0.5.12 # via -r ./connectors/vastdb.in xmltodict==0.14.2 # via vastdb -xxhash==3.5.0 - # via pooch From 8baece86ab13b2866c919243152b8e3831c778bf Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 12 Feb 2025 16:06:12 -0500 Subject: [PATCH 4/5] drop use of cache folder --- test/unit/v2/embedders/test_huggingface.py | 1 - unstructured_ingest/embed/huggingface.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/test/unit/v2/embedders/test_huggingface.py b/test/unit/v2/embedders/test_huggingface.py index db4738e6f..3e91103c3 100644 --- 
a/test/unit/v2/embedders/test_huggingface.py +++ b/test/unit/v2/embedders/test_huggingface.py @@ -27,7 +27,6 @@ def generate_embedder_config_params() -> dict: if random.random() < 0.5 else None ) - params["cache_folder"] = fake.file_path() if random.random() < 0.5 else None return params diff --git a/unstructured_ingest/embed/huggingface.py b/unstructured_ingest/embed/huggingface.py index f87bbc0ef..83aec39a1 100644 --- a/unstructured_ingest/embed/huggingface.py +++ b/unstructured_ingest/embed/huggingface.py @@ -22,7 +22,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig): default_factory=lambda: {"device": "cpu"}, alias="model_kwargs" ) encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False}) - cache_folder: Optional[str] = Field(default=None) @requires_dependencies( ["sentence_transformers"], @@ -33,7 +32,6 @@ def get_client(self) -> "SentenceTransformer": return SentenceTransformer( model_name_or_path=self.embedder_model_name, - cache_folder=self.cache_folder, **self.embedder_model_kwargs, ) From db850a3321181447ab1b47c9c9efb373c0c54aa9 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 12 Feb 2025 21:15:03 -0500 Subject: [PATCH 5/5] fix huggingface embedder --- unstructured_ingest/embed/huggingface.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unstructured_ingest/embed/huggingface.py b/unstructured_ingest/embed/huggingface.py index 83aec39a1..627dc28b9 100644 --- a/unstructured_ingest/embed/huggingface.py +++ b/unstructured_ingest/embed/huggingface.py @@ -15,9 +15,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig): - embedder_model_name: Optional[str] = Field( - default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name" - ) + embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name") embedder_model_kwargs: Optional[dict] = Field( default_factory=lambda: {"device": "cpu"}, alias="model_kwargs" )