Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/increase unit tests #184

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.0.26-dev6
## 0.0.26-dev7

### Enhancements

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ check-version:
###########
.PHONY: unit-test
unit-test:
PYTHONPATH=. pytest test/unit
PYTHONPATH=. pytest -sv --cov unstructured_ingest/ test/unit

.PHONY: integration-test
integration-test:
Expand Down
12 changes: 12 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,15 @@ exclude= [
"build",
"migrations"
]

# limit coverage to v2 code
[tool.coverage.run]
omit=[
"unstructured_ingest/connector/*",
"unstructured_ingest/pipeline/*",
"unstructured_ingest/cli/*",
"unstructured_ingest/runner/*",
"unstructured_ingest/utils/*",
"unstructured_ingest/ingest_backoff/*",
"unstructured_ingest/enhanced_dataclass/*"
]
1 change: 1 addition & 0 deletions requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pytest-mock
unstructured
pytest-asyncio
pytest_tagging
faker

# Connector specific deps
cryptography
Expand Down
12 changes: 8 additions & 4 deletions requirements/test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This file was autogenerated by uv via the following command:
# uv pip compile test.in --output-file test.txt --no-strip-extras
# uv pip compile test.in --output-file test.txt --no-strip-extras --python-version 3.9
annotated-types==0.7.0
# via pydantic
anyio==4.6.2.post1
Expand Down Expand Up @@ -43,6 +43,8 @@ exceptiongroup==1.2.2
# via
# anyio
# pytest
faker==30.6.0
# via -r test.in
filetype==1.2.0
# via unstructured
fsspec==2024.5.0
Expand Down Expand Up @@ -91,7 +93,7 @@ googleapis-common-protos[grpc]==1.65.0
# grpcio-status
grpc-google-iam-v1==0.13.1
# via google-cloud-resource-manager
grpcio==1.66.2
grpcio==1.67.0
# via
# -c ./common/constraints.txt
# google-api-core
Expand Down Expand Up @@ -121,7 +123,7 @@ langdetect==1.0.9
# via unstructured
lxml==5.3.0
# via unstructured
marshmallow==3.22.0
marshmallow==3.23.0
# via dataclasses-json
mypy-extensions==1.0.0
# via typing-inspect
Expand Down Expand Up @@ -159,7 +161,7 @@ protobuf==4.23.4
# grpc-google-iam-v1
# grpcio-status
# proto-plus
psutil==6.0.0
psutil==6.1.0
# via unstructured
pyasn1==0.6.1
# via
Expand Down Expand Up @@ -194,6 +196,7 @@ pytest-tagging==1.5.3
# via -r test.in
python-dateutil==2.8.2
# via
# faker
# google-cloud-bigquery
# unstructured-client
python-iso639==2024.4.27
Expand Down Expand Up @@ -242,6 +245,7 @@ tqdm==4.66.5
typing-extensions==4.12.2
# via
# anyio
# faker
# pydantic
# pydantic-core
# pypdf
Expand Down
Empty file added test/unit/v2/__init__.py
Empty file.
Empty file.
49 changes: 49 additions & 0 deletions test/unit/v2/chunkers/test_chunkers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import random

import faker
import pytest

from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig

fake = faker.Faker()


# Build a randomized dict of keyword parameters for ``ChunkerConfig``.
#
# A single ``random_val`` is drawn up front and compared against 0.5 by both
# ``if`` statements below, so the two conditions always agree within one call.
# Keys written as ``<faker value> if random.random() < 0.5 else None`` receive
# either a Faker-generated value or an explicit ``None``, decided by a separate
# draw for each key.
#
# NOTE(review): because both conditions test the same ``random_val``, the
# generated data never combines ``chunk_by_api=False`` with the parameters
# guarded by the first ``if`` -- confirm that this correlation is intended.
def generate_chunker_config_params() -> dict:
params = {}
random_val = random.random()
if random_val < 0.5:
params["chunking_strategy"] = fake.word() if random.random() < 0.5 else None
params["chunk_combine_text_under_n_chars"] = (
fake.random_int() if random.random() < 0.5 else None
)
params["chunk_include_orig_elements"] = fake.boolean() if random.random() < 0.5 else None
params["chunk_max_characters"] = fake.random_int()
params["chunk_multipage_sections"] = fake.boolean()
params["chunk_new_after_n_chars"] = fake.random_int() if random.random() < 0.5 else None
params["chunk_overlap"] = fake.random_int() if random.random() < 0.5 else None
params["chunk_overlap_all"] = fake.boolean() if random.random() < 0.5 else None
# API-based chunking: the endpoint and key are only supplied together with
# ``chunk_by_api=True``; otherwise the flag is explicitly set to False.
if random_val < 0.5:
params["chunk_by_api"] = True
params["chunking_endpoint"] = fake.url()
params["chunk_api_key"] = fake.password()
else:
params["chunk_by_api"] = False

return params


@pytest.mark.parametrize(
    "chunker_config_params", [generate_chunker_config_params() for _ in range(10)]
)
def test_chunker_config(chunker_config_params: dict) -> None:
    """Check that a randomly generated parameter dict validates as a ``ChunkerConfig``.

    Runs once for each of the ten parameter dicts produced at collection time.
    The case passes when ``ChunkerConfig.model_validate`` returns a truthy
    object without raising.

    Args:
        chunker_config_params: Keyword parameters produced by
            ``generate_chunker_config_params``.
    """
    chunker_config = ChunkerConfig.model_validate(chunker_config_params)
    assert chunker_config


@pytest.mark.parametrize(
    "chunker_config_params", [generate_chunker_config_params() for _ in range(10)]
)
def test_chunker(chunker_config_params: dict) -> None:
    """Check that a ``Chunker`` can be constructed from a validated random config.

    Runs once for each of the ten parameter dicts produced at collection time.
    The dict is first validated into a ``ChunkerConfig``, which is then passed
    to ``Chunker`` via its ``config`` keyword; the case passes when the
    resulting object is truthy and nothing raises.

    Args:
        chunker_config_params: Keyword parameters produced by
            ``generate_chunker_config_params``.
    """
    chunker_config = ChunkerConfig.model_validate(chunker_config_params)
    chunker = Chunker(config=chunker_config)
    assert chunker
Empty file.
Empty file.
36 changes: 36 additions & 0 deletions test/unit/v2/embedders/test_bedrock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import random

import faker
import pytest

from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder

fake = faker.Faker()


def generate_embedder_config_params() -> dict:
    """Return a randomized keyword dict used to build a ``BedrockEmbeddingConfig``.

    The two AWS credential keys and ``region_name`` are always present and are
    filled with Faker-generated values. ``embed_model_name`` is added only when
    a ``random.random()`` draw falls below 0.5.
    """
    config_kwargs = dict(
        aws_access_key_id=fake.password(),
        aws_secret_access_key=fake.password(),
        region_name=fake.city(),
    )
    include_model_name = random.random() < 0.5
    if include_model_name:
        config_kwargs["embed_model_name"] = fake.word()
    return config_kwargs


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder_config(embedder_config_params: dict) -> None:
    """Check that a random parameter dict validates as a ``BedrockEmbeddingConfig``.

    Runs once per generated parameter dict; passes when ``model_validate``
    returns a truthy object without raising.
    """
    validated_config = BedrockEmbeddingConfig.model_validate(embedder_config_params)
    assert validated_config


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder(embedder_config_params: dict) -> None:
    """Check that a ``BedrockEmbeddingEncoder`` can be built from a validated config.

    Runs once per generated parameter dict; passes when the encoder constructed
    with ``config=<validated config>`` is truthy and nothing raises.
    """
    validated_config = BedrockEmbeddingConfig.model_validate(embedder_config_params)
    encoder = BedrockEmbeddingEncoder(config=validated_config)
    assert encoder
48 changes: 48 additions & 0 deletions test/unit/v2/embedders/test_huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import random
from typing import Any

import faker
import pytest

from test.unit.v2.utils.data_generator import generate_random_dictionary
from unstructured_ingest.embed.huggingface import (
HuggingFaceEmbeddingConfig,
HuggingFaceEmbeddingEncoder,
)

fake = faker.Faker()


# Build a randomized dict of keyword parameters for ``HuggingFaceEmbeddingConfig``.
#
# Starts from an empty dict, so every key is optional. A
# ``random.random() < 0.5`` draw gates the population of the dict, and each
# value is then either a generated value or an explicit ``None``, decided by
# its own ``random.random() < 0.5`` draw.
#
# ``generate_random_dictionary`` is imported from the shared test utilities;
# presumably it returns a dict with ``str`` keys and arbitrary values, as the
# ``key_type``/``value_type`` arguments suggest -- verify against the helper.
#
# NOTE(review): the model-name key is spelled ``embed_model_name`` while the
# neighbouring key uses the ``embedder_`` prefix (``embedder_model_kwargs``).
# Confirm this matches the config's field name; a misspelled key could be
# dropped during validation without failing these tests.
def generate_embedder_config_params() -> dict:
params = {}
if random.random() < 0.5:
params["embed_model_name"] = fake.word() if random.random() < 0.5 else None
params["embedder_model_kwargs"] = (
generate_random_dictionary(key_type=str, value_type=Any)
if random.random() < 0.5
else None
)
params["encode_kwargs"] = (
generate_random_dictionary(key_type=str, value_type=Any)
if random.random() < 0.5
else None
)
params["cache_folder"] = fake.file_path() if random.random() < 0.5 else None
return params


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder_config(embedder_config_params: dict) -> None:
    """Check that a random parameter dict validates as a ``HuggingFaceEmbeddingConfig``.

    Runs once per generated parameter dict; passes when ``model_validate``
    returns a truthy object without raising.
    """
    validated_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
    assert validated_config


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder(embedder_config_params: dict) -> None:
    """Check that a ``HuggingFaceEmbeddingEncoder`` can be built from a validated config.

    Runs once per generated parameter dict; passes when the encoder constructed
    with ``config=<validated config>`` is truthy and nothing raises.
    """
    validated_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
    encoder = HuggingFaceEmbeddingEncoder(config=validated_config)
    assert encoder
37 changes: 37 additions & 0 deletions test/unit/v2/embedders/test_mixedbread.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import random

import faker
import pytest

from unstructured_ingest.embed.mixedbreadai import (
MixedbreadAIEmbeddingConfig,
MixedbreadAIEmbeddingEncoder,
)

fake = faker.Faker()


def generate_embedder_config_params() -> dict:
    """Return a randomized keyword dict used to build a ``MixedbreadAIEmbeddingConfig``.

    ``api_key`` is always present. ``embedder_model_name`` is added only when a
    ``random.random()`` draw falls below 0.5.
    """
    config_kwargs = dict(api_key=fake.password())
    with_model_name = random.random() < 0.5
    if with_model_name:
        config_kwargs["embedder_model_name"] = fake.word()
    return config_kwargs


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder_config(embedder_config_params: dict) -> None:
    """Check that a random parameter dict validates as a ``MixedbreadAIEmbeddingConfig``.

    Runs once per generated parameter dict; passes when ``model_validate``
    returns a truthy object without raising.
    """
    validated_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated_config


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder(embedder_config_params: dict) -> None:
    """Check that a ``MixedbreadAIEmbeddingEncoder`` can be built from a validated config.

    Runs once per generated parameter dict; passes when the encoder constructed
    with ``config=<validated config>`` is truthy and nothing raises.
    """
    validated_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = MixedbreadAIEmbeddingEncoder(config=validated_config)
    assert encoder
35 changes: 35 additions & 0 deletions test/unit/v2/embedders/test_octoai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import random

import faker
import pytest

from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

fake = faker.Faker()


# Build a randomized dict of keyword parameters for ``OctoAiEmbeddingConfig``.
#
# ``api_key`` is always present and comes from ``fake.password()``. A
# ``random.random() < 0.5`` draw gates the optional part of the dict, which
# includes ``embedder_model_name`` (from ``fake.word()``); ``base_url`` is
# filled from ``fake.url()``.
def generate_embedder_config_params() -> dict:
params = {
"api_key": fake.password(),
}
if random.random() < 0.5:
params["embedder_model_name"] = fake.word()
params["base_url"] = fake.url()
return params


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder_config(embedder_config_params: dict) -> None:
    """Check that a random parameter dict validates as an ``OctoAiEmbeddingConfig``.

    Runs once per generated parameter dict; passes when ``model_validate``
    returns a truthy object without raising.
    """
    validated_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
    assert validated_config


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder(embedder_config_params: dict) -> None:
    """Check that an ``OctoAIEmbeddingEncoder`` can be built from a validated config.

    Runs once per generated parameter dict; passes when the encoder constructed
    with ``config=<validated config>`` is truthy and nothing raises.
    """
    validated_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
    encoder = OctoAIEmbeddingEncoder(config=validated_config)
    assert encoder
35 changes: 35 additions & 0 deletions test/unit/v2/embedders/test_openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import random

import faker
import pytest

from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

fake = faker.Faker()


# Build a randomized dict of keyword parameters for ``OpenAIEmbeddingConfig``.
#
# ``api_key`` is always present and comes from ``fake.password()``. A
# ``random.random() < 0.5`` draw gates the optional part of the dict, which
# includes ``embedder_model_name`` (from ``fake.word()``); ``base_url`` is
# filled from ``fake.url()``.
def generate_embedder_config_params() -> dict:
params = {
"api_key": fake.password(),
}
if random.random() < 0.5:
params["embedder_model_name"] = fake.word()
params["base_url"] = fake.url()
return params


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder_config(embedder_config_params: dict) -> None:
    """Check that a random parameter dict validates as an ``OpenAIEmbeddingConfig``.

    Runs once per generated parameter dict; passes when ``model_validate``
    returns a truthy object without raising.
    """
    validated_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated_config


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder(embedder_config_params: dict) -> None:
    """Check that an ``OpenAIEmbeddingEncoder`` can be built from a validated config.

    Runs once per generated parameter dict; passes when the encoder constructed
    with ``config=<validated config>`` is truthy and nothing raises.
    """
    validated_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = OpenAIEmbeddingEncoder(config=validated_config)
    assert encoder
37 changes: 37 additions & 0 deletions test/unit/v2/embedders/test_togetherai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import random

import faker
import pytest

from unstructured_ingest.embed.togetherai import (
TogetherAIEmbeddingConfig,
TogetherAIEmbeddingEncoder,
)

fake = faker.Faker()


def generate_embedder_config_params() -> dict:
    """Return a randomized keyword dict used to build a ``TogetherAIEmbeddingConfig``.

    ``api_key`` is always present. ``embedder_model_name`` is added only when a
    ``random.random()`` draw falls below 0.5.
    """
    config_kwargs = dict(api_key=fake.password())
    with_model_name = random.random() < 0.5
    if with_model_name:
        config_kwargs["embedder_model_name"] = fake.word()
    return config_kwargs


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder_config(embedder_config_params: dict) -> None:
    """Check that a random parameter dict validates as a ``TogetherAIEmbeddingConfig``.

    Runs once per generated parameter dict; passes when ``model_validate``
    returns a truthy object without raising.
    """
    validated_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated_config


@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder(embedder_config_params: dict) -> None:
    """Check that a ``TogetherAIEmbeddingEncoder`` can be built from a validated config.

    Runs once per generated parameter dict; passes when the encoder constructed
    with ``config=<validated config>`` is truthy and nothing raises.
    """
    validated_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = TogetherAIEmbeddingEncoder(config=validated_config)
    assert encoder
Loading