feat: added ability to request one file as multipart/form data (#167)

Unstructured-IO · Apr 25, 2023 · 0628540 · 0628540
1 parent 9eded4e
commit 0628540
Show file tree

Hide file tree

Showing 19 changed files with 712 additions and 733 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 0.10.1
+
+* Add ability to request one file as multipart/form-data
+
 # 0.10.0
 
 * Update templates for generated API. 

diff --git a/test_unstructured_api_tools/api/test_file_apis.py b/test_unstructured_api_tools/api/test_file_apis.py
@@ -209,8 +209,7 @@ def _json_for_one_file(test_file):
         ([FILE_IMAGE], P_INPUT_1_AND_2_MULTI, JSON, 200, None),
         ([FILE_DOCX, FILE_IMAGE], P_INPUT_1_AND_2_MULTI, JSON, 200, None),
         ([FILE_DOCX, FILE_IMAGE], P_INPUT_1_AND_2_MULTI, MIXED, 200, None),
-        # json returned though mixed requested (maybe not a bug for 1 file?)
-        pytest.param([FILE_DOCX], P_INPUT_1_MULTI, MIXED, 200, None, marks=pytest.mark.xfail),
+        ([FILE_DOCX], P_INPUT_1_MULTI, MIXED, 200, None),
         # json returned though csv requested
         pytest.param(
             [FILE_IMAGE], P_INPUT_1_AND_2_MULTI, TEXT_CSV, 200, None, marks=pytest.mark.xfail
@@ -309,10 +308,7 @@ def test_process_file_2(
     "gz_content_type",
     [
         ([FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 200, False, None, None),
-        # endpoint doesn't accept mixed media type for one file
-        pytest.param(
-            [FILE_DOCX], MIXED, RESPONSE_SCHEMA_ISD, 200, False, None, None, marks=pytest.mark.xfail
-        ),
+        ([FILE_DOCX], MIXED, RESPONSE_SCHEMA_ISD, 200, False, None, None),
         # endpoint fails because media type text/csv should have response type str
         pytest.param(
             [FILE_DOCX],
@@ -330,7 +326,6 @@ def test_process_file_2(
             [FILE_DOCX], None, RESPONSE_SCHEMA_ISD, 200, False, None, None, marks=pytest.mark.xfail
         ),
         ([FILE_DOCX], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None),
-        # endpoint doesn't accept mixed media type for one file
         pytest.param(
             [FILE_DOCX],
             MIXED,
@@ -339,7 +334,6 @@ def test_process_file_2(
             False,
             None,
             None,
-            marks=pytest.mark.xfail,
         ),
         # endpoint fails because media type text/csv should have response type str
         pytest.param(
@@ -377,7 +371,7 @@ def test_process_file_2(
             None,
             marks=pytest.mark.xfail,
         ),
-        ([FILE_DOCX, FILE_IMAGE], None, RESPONSE_SCHEMA_ISD, 200, False, None, None),
+        ([FILE_DOCX, FILE_IMAGE], None, RESPONSE_SCHEMA_ISD, 406, False, None, None),
         ([FILE_DOCX, FILE_IMAGE], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None),
         ([FILE_DOCX, FILE_IMAGE], MIXED, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None),
         # endpoint fails because text/csv is not acceptable for multiple files
@@ -391,12 +385,12 @@ def test_process_file_2(
             None,
             marks=pytest.mark.xfail,
         ),
-        ([FILE_DOCX, FILE_IMAGE], None, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None),
+        ([FILE_DOCX, FILE_IMAGE], None, RESPONSE_SCHEMA_LABELSTUDIO, 406, False, None, None),
         (
             [FILE_DOCX, FILE_IMAGE, GZIP_FILE_IMAGE],
             None,
             RESPONSE_SCHEMA_LABELSTUDIO,
-            200,
+            406,
             False,
             None,
             None,
@@ -405,7 +399,7 @@ def test_process_file_2(
             [FILE_DOCX, FILE_IMAGE, GZIP_FILE_DOCX],
             None,
             RESPONSE_SCHEMA_LABELSTUDIO,
-            200,
+            406,
             False,
             None,
             None,
@@ -414,7 +408,7 @@ def test_process_file_2(
             [FILE_DOCX, FILE_IMAGE, GZIP_FILE_IMAGE, GZIP_FILE_DOCX],
             None,
             RESPONSE_SCHEMA_LABELSTUDIO,
-            200,
+            406,
             False,
             None,
             None,
@@ -629,7 +623,7 @@ def test_process_file_3(
             False,
             None,
         ),
-        ([FILE_DOCX], MIXED, RESPONSE_SCHEMA_ISD, P_INPUT_1_SINGLE, 406, None, False, None),
+        ([FILE_DOCX], MIXED, RESPONSE_SCHEMA_ISD, P_INPUT_1_SINGLE, 200, None, False, None),
         ([], MIXED, RESPONSE_SCHEMA_ISD, P_INPUT_1_SINGLE, 400, None, False, None),
         (
             [GZIP_FILE_DOCX],
@@ -914,7 +908,7 @@ def test_process_file_4(
             RESPONSE_SCHEMA_ISD,
             P_INPUT_1_MULTI,
             P_INPUT_2_EMPTY,
-            406,
+            200,
             False,
             None,
             None,

diff --git a/test_unstructured_api_tools/api/test_file_text_apis.py b/test_unstructured_api_tools/api/test_file_text_apis.py
@@ -456,7 +456,7 @@ def test_process_file_text_1(
         ([FILE_MARKDOWN], [FILE_TXT_1], TEXT_CSV, P_INPUT_2_MULTI, 406, False, None, None),
         ([], [FILE_TXT_1], JSON, P_INPUT_2_SINGLE, 200, False, None, None),
         ([FILE_DOCX], [], JSON, P_INPUT_2_SINGLE, 200, False, None, None),
-        ([], [FILE_TXT_1], MIXED, P_INPUT_2_EMPTY, 406, False, None, None),
+        ([], [FILE_TXT_1], MIXED, P_INPUT_2_EMPTY, 200, False, None, None),
         (
             [GZIP_FILE_DOCX],
             [FILE_TXT_1],
@@ -687,7 +687,7 @@ def test_process_file_text_2(
         ),
         ([], [FILE_TXT_1], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None),
         ([FILE_DOCX], [], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None),
-        ([FILE_DOCX], [], MIXED, RESPONSE_SCHEMA_LABELSTUDIO, 406, False, None, None),
+        ([FILE_DOCX], [], MIXED, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None),
         (
             [GZIP_FILE_DOCX],
             [FILE_TXT_1],
@@ -1063,7 +1063,7 @@ def test_process_file_text_3(
             RESPONSE_SCHEMA_LABELSTUDIO,
             P_INPUT_1_EMPTY,
             P_INPUT_2_EMPTY,
-            406,
+            200,
             False,
             None,
             None,

diff --git a/test_unstructured_api_tools/api/test_text_apis.py b/test_unstructured_api_tools/api/test_text_apis.py
@@ -390,8 +390,7 @@ def test_process_text_2(
     [
         ([FILE_TXT_1], JSON, 200, False, None, None),
         ([GZIP_FILE_TXT_1], JSON, 200, False, None, None),
-        # endpoint doesn't accept mixed media type for one file
-        pytest.param([FILE_TXT_1], MIXED, 200, False, None, None, marks=pytest.mark.xfail),
+        ([FILE_TXT_1], MIXED, 200, False, None, None),
         # endpoint fails because media type text/csv should have response type str
         pytest.param([FILE_TXT_1], TEXT_CSV, 200, False, None, None, marks=pytest.mark.xfail),
         # endpoint fails because media type text/csv should have response type str
@@ -421,19 +420,18 @@ def test_process_text_2(
             None,
             marks=pytest.mark.xfail,
         ),
-        ([FILE_TXT_1, FILE_TXT_2], None, 200, False, None, None),
+        ([FILE_TXT_1, FILE_TXT_2], None, 406, False, None, None),
         ([FILE_TXT_2], JSON, 200, False, None, None),
         ([GZIP_FILE_TXT_2], JSON, 200, False, None, None),
-        # endpoint doesn't accept mixed media type for one file
-        pytest.param([FILE_TXT_2], MIXED, 200, False, None, None, marks=pytest.mark.xfail),
+        ([FILE_TXT_2], MIXED, 200, False, None, None),
         # endpoint fails because media type text/csv should have response type str
         pytest.param([FILE_TXT_2], TEXT_CSV, 200, False, None, None, marks=pytest.mark.xfail),
         # endpoint fails because media type text/csv should have response type str
         # because None response type has default text/csv value
         pytest.param([FILE_TXT_2], None, 200, False, None, None, marks=pytest.mark.xfail),
-        ([FILE_TXT_2, FILE_MARKDOWN], None, 200, True, None, None),
-        ([FILE_TXT_2, FILE_TXT_1], None, 200, False, FILENAME_FORMATS[FILE_TXT_1], None),
-        ([FILE_TXT_2, FILE_MARKDOWN], None, 400, False, FILENAME_FORMATS[FILE_TXT_1], None),
+        ([FILE_TXT_2, FILE_MARKDOWN], None, 406, True, None, None),
+        ([FILE_TXT_2, FILE_TXT_1], None, 406, False, FILENAME_FORMATS[FILE_TXT_1], None),
+        ([FILE_TXT_2, FILE_MARKDOWN], None, 406, False, FILENAME_FORMATS[FILE_TXT_1], None),
         ([], None, 400, False, None, None),
         ([GZIP_FILE_TXT_1], JSON, 200, False, None, FILENAME_FORMATS[FILE_TXT_1]),
     ],
@@ -513,7 +511,7 @@ def test_process_text_3(
             None,
         ),
         ([FILE_TXT_1, FILE_TXT_2], TEXT_CSV, RESPONSE_SCHEMA_ISD, 406, False, None, None),
-        ([FILE_TXT_1], MIXED, RESPONSE_SCHEMA_ISD, 406, False, None, None),
+        ([FILE_TXT_1], MIXED, RESPONSE_SCHEMA_ISD, 200, False, None, None),
         ([], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None),
         (
             [GZIP_FILE_TXT_1],

diff --git a/..._unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py b/..._unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py
@@ -181,55 +181,45 @@ def pipeline_1(
                 "multipart/mixed",
                 "application/json",
             ]:
-                return PlainTextResponse(
-                    content=(
+                raise HTTPException(
+                    detail=(
                         f"Conflict in media type {content_type}"
                         ' with response type "multipart/mixed".\n'
                     ),
                     status_code=status.HTTP_406_NOT_ACCEPTABLE,
                 )
 
-            def response_generator(is_multipart):
-                for file in files:
-                    file_content_type = get_validated_mimetype(file)
-
-                    _file = file.file
-
-                    response = pipeline_api(
-                        _file,
-                        m_input2=input2,
-                        filename=file.filename,
-                        file_content_type=file_content_type,
-                    )
-                    if is_multipart:
-                        if type(response) not in [str, bytes]:
-                            response = json.dumps(response)
-                    yield response
-
-            if content_type == "multipart/mixed":
-                return MultipartMixedResponse(
-                    response_generator(is_multipart=True),
-                )
-            else:
-                return response_generator(is_multipart=False)
-        else:
-            file = files[0]
-            _file = file.file
+        def response_generator(is_multipart):
+            for file in files:
+                file_content_type = get_validated_mimetype(file)
 
-            file_content_type = get_validated_mimetype(file)
+                _file = file.file
 
-            response = pipeline_api(
-                _file,
-                m_input2=input2,
-                filename=file.filename,
-                file_content_type=file_content_type,
-            )
+                response = pipeline_api(
+                    _file,
+                    m_input2=input2,
+                    filename=file.filename,
+                    file_content_type=file_content_type,
+                )
 
-            return response
+                if is_multipart:
+                    if type(response) not in [str, bytes]:
+                        response = json.dumps(response)
+                yield response
 
+        if content_type == "multipart/mixed":
+            return MultipartMixedResponse(
+                response_generator(is_multipart=True),
+            )
+        else:
+            return (
+                list(response_generator(is_multipart=False))[0]
+                if len(files) == 1
+                else response_generator(is_multipart=False)
+            )
     else:
-        return PlainTextResponse(
-            content='Request parameter "files" is required.\n',
+        raise HTTPException(
+            detail='Request parameter "files" is required.\n',
             status_code=status.HTTP_400_BAD_REQUEST,
         )
 

diff --git a/..._unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py b/..._unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py
@@ -168,49 +168,42 @@ def pipeline_1(
                 "multipart/mixed",
                 "application/json",
             ]:
-                return PlainTextResponse(
-                    content=(
+                raise HTTPException(
+                    detail=(
                         f"Conflict in media type {content_type}"
                         ' with response type "multipart/mixed".\n'
                     ),
                     status_code=status.HTTP_406_NOT_ACCEPTABLE,
                 )
 
-            def response_generator(is_multipart):
-                for file in files:
-                    get_validated_mimetype(file)
+        def response_generator(is_multipart):
+            for file in files:
+                get_validated_mimetype(file)
 
-                    _file = file.file
+                _file = file.file
 
-                    response = pipeline_api(
-                        _file,
-                    )
-                    if is_multipart:
-                        if type(response) not in [str, bytes]:
-                            response = json.dumps(response)
-                    yield response
-
-            if content_type == "multipart/mixed":
-                return MultipartMixedResponse(
-                    response_generator(is_multipart=True),
+                response = pipeline_api(
+                    _file,
                 )
-            else:
-                return response_generator(is_multipart=False)
-        else:
-            file = files[0]
-            _file = file.file
 
-            get_validated_mimetype(file)
+                if is_multipart:
+                    if type(response) not in [str, bytes]:
+                        response = json.dumps(response)
+                yield response
 
-            response = pipeline_api(
-                _file,
+        if content_type == "multipart/mixed":
+            return MultipartMixedResponse(
+                response_generator(is_multipart=True),
+            )
+        else:
+            return (
+                list(response_generator(is_multipart=False))[0]
+                if len(files) == 1
+                else response_generator(is_multipart=False)
             )
-
-            return response
-
     else:
-        return PlainTextResponse(
-            content='Request parameter "files" is required.\n',
+        raise HTTPException(
+            detail='Request parameter "files" is required.\n',
             status_code=status.HTTP_400_BAD_REQUEST,
         )