run-llama · EmanuelCampos · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py
@@ -23,28 +23,52 @@
 
 class FileSystemReaderMixin(ABC):
     @abstractmethod
-    def read_file_content(self, input_file: Path, **kwargs: Any) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
         """
         Read the bytes content of a file.
 
         Args:
-            input_file (Path): Path to the file.
+            resource_id (str): Identifier of the file to read (a path or a backend-specific ID).
 
         Returns:
             bytes: File content.
         """
 
-    async def aread_file_content(self, input_file: Path, **kwargs: Any) -> bytes:
+    async def aread_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
         """
         Read the bytes content of a file asynchronously.
 
         Args:
-            input_file (Path): Path to the file.
+            resource_id (str): Identifier of the file to read (a path or a backend-specific ID).
 
         Returns:
             bytes: File content.
         """
-        return self.read_file_content(input_file, **kwargs)
+        return self.read_file_content(resource_id, **kwargs)  # default: delegate to the sync method
+
+    @abstractmethod
+    def get_file_name(self, resource_id: str) -> str:
+        """
+        Get the file name from the resource ID.
+        """
+
+    async def aget_file_name(self, resource_id: str) -> str:
+        """
+        Async variant of ``get_file_name``; this default simply calls the sync method.
+        """
+        return self.get_file_name(resource_id)
+
+    @abstractmethod
+    def get_file_path(self, resource_id: str) -> str:
+        """
+        Get the file path from the resource ID.
+        """
+
+    async def aget_file_path(self, resource_id: str) -> str:
+        """
+        Async variant of ``get_file_path``; this default simply calls the sync method.
+        """
+        return self.get_file_path(resource_id)
 
 
 def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
@@ -465,10 +489,12 @@ async def aload_resource(
             **kwargs,
         )
 
-    def read_file_content(self, input_file: Path, **kwargs: Any) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
         """Read file content."""
         fs: fsspec.AbstractFileSystem = kwargs.get("fs", self.fs)
-        with fs.open(input_file, errors=self.errors, encoding=self.encoding) as f:
+        with fs.open(  # resource_id is interpreted as a path on `fs`
+            Path(resource_id), errors=self.errors, encoding=self.encoding
+        ) as f:
             return f.read()
 
     @staticmethod
@@ -792,3 +818,9 @@ def iter_data(
 
             if len(documents) > 0:
                 yield documents
+
+    def get_file_name(self, resource_id: str) -> str:
+        return Path(resource_id).name  # last path component of the resource ID
+
+    def get_file_path(self, resource_id: str) -> str:
+        return resource_id  # the resource ID is returned unchanged as the path
diff --git a/...ons/readers/llama-index-readers-azstorage-blob/llama_index/readers/azstorage_blob/base.py b/...ons/readers/llama-index-readers-azstorage-blob/llama_index/readers/azstorage_blob/base.py
@@ -202,10 +202,10 @@ def load_resource(self, resource_id: str, **kwargs: Any) -> List[Document]:
             )
             raise
 
-    def read_file_content(self, input_file: Path, **kwargs) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
         """Read the content of a file from Azure Storage Blob."""
         container_client = self._get_container_client()
-        blob_client = container_client.get_blob_client(input_file)
+        blob_client = container_client.get_blob_client(resource_id)  # resource_id is handed over as the blob reference
         stream = blob_client.download_blob()
         return stream.readall()
 
@@ -230,3 +230,9 @@ def load_data(self) -> List[Document]:
             logger.info("Document creation starting")
 
             return self._load_documents_with_metadata(files_metadata, temp_dir)
+
+    def get_file_name(self, resource_id: str) -> str:
+        return Path(resource_id).name  # last path component of the resource ID
+
+    def get_file_path(self, resource_id: str) -> str:
+        return resource_id  # the resource ID is returned unchanged as the path
diff --git a/...ex-integrations/readers/llama-index-readers-box/llama_index/readers/box/BoxReader/base.py b/...ex-integrations/readers/llama-index-readers-box/llama_index/readers/box/BoxReader/base.py
@@ -133,8 +133,8 @@ def list_resources(
             )
         return [file.id for file in box_files]
 
-    def read_file_content(self, input_file: Path, **kwargs) -> bytes:
-        file_id = input_file.name
+    def read_file_content(self, resource_id: str, **kwargs) -> bytes:
+        file_id = Path(resource_id).name  # only the last path component is used as the Box file ID
         return get_file_content_by_id(box_client=self._box_client, box_file_id=file_id)
 
     def search_resources(
@@ -378,3 +378,12 @@ def _download_files(self, box_files: List[File], temp_dir: str) -> List[File]:
             file.downloaded_file_path = local_path
             box_files_with_path.append(file)
         return box_files_with_path
+
+    def get_file_name(self, resource_id: str) -> str:
+        box_file = get_box_files_details(  # resource_id is passed as-is (no Path(...).name, unlike read_file_content)
+            box_client=self._box_client, file_ids=[resource_id]
+        )
+        return box_file[0].name  # NOTE(review): assumes a non-empty result; an empty list would raise IndexError
+
+    def get_file_path(self, resource_id: str) -> str:
+        return self.get_file_name(resource_id)  # NOTE(review): yields the file name only, not a folder path — confirm intended
diff --git a/llama-index-integrations/readers/llama-index-readers-gcs/llama_index/readers/gcs/base.py b/llama-index-integrations/readers/llama-index-readers-gcs/llama_index/readers/gcs/base.py
@@ -251,12 +251,12 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
             logger.error(f"Error loading resource from GCS: {e!s}")
             raise
 
-    def read_file_content(self, input_file: Path, **kwargs) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs) -> bytes:
         """
         Read the content of a specific file from GCS.
 
         Args:
-            input_file (Path): The path to the file to read.
+            resource_id (str): The resource ID to read.
             **kwargs: Additional arguments to pass to the underlying read_file_content method.
 
         Returns:
@@ -266,10 +266,16 @@ def read_file_content(self, input_file: Path, **kwargs) -> bytes:
             Exception: If there's an error reading the file content.
         """
         try:
-            logger.info(f"Reading file content: {input_file}")
+            logger.info(f"Reading file content: {resource_id}")
             return self._get_simple_directory_reader().read_file_content(
-                input_file, **kwargs
+                resource_id, **kwargs
             )
         except Exception as e:
             logger.error(f"Error reading file content from GCS: {e!s}")
             raise
+
+    def get_file_name(self, resource_id: str) -> str:
+        return Path(resource_id).name  # last path component of the resource ID
+
+    def get_file_path(self, resource_id: str) -> str:
+        return resource_id  # the resource ID is returned unchanged as the path
diff --git a/...-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py b/...-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py
@@ -680,12 +680,34 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
             self.drive_id, [resource_id], None, self.query_string
         )
 
-    def read_file_content(self, file_path: Union[str, Path], **kwargs) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs) -> bytes:
         """Read the content of a specific file from Google Drive."""
         self._creds = self._get_credentials()
 
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_file = os.path.join(temp_dir, "temp_file")
-            downloaded_file = self._download_file(file_path, temp_file)
+            downloaded_file = self._download_file(resource_id, temp_file)  # bytes are read back before temp_dir is removed
             with open(downloaded_file, "rb") as file:
                 return file.read()
+
+    def get_file_name(self, resource_id: str) -> str:
+        """Get the file name for a Drive file ID via the Drive v3 ``files().get`` call."""
+        from googleapiclient.discovery import build
+
+        self._creds = self._get_credentials()
+
+        service = build("drive", "v3", credentials=self._creds)
+        file = service.files().get(fileId=resource_id, supportsAllDrives=True).execute()
+        return file["name"]
+
+    def get_file_path(self, resource_id: str) -> str:
+        """Get the file path for a Drive file ID using ``_get_relative_path`` and ``self.folder_id``."""
+        from googleapiclient.discovery import build
+
+        self._creds = self._get_credentials()
+
+        service = build("drive", "v3", credentials=self._creds)
+
+        return self._get_relative_path(
+            service, file_id=resource_id, folder_id=self.folder_id
+        )
diff --git a/...ers/llama-index-readers-microsoft-onedrive/llama_index/readers/microsoft_onedrive/base.py b/...ers/llama-index-readers-microsoft-onedrive/llama_index/readers/microsoft_onedrive/base.py
@@ -2,10 +2,10 @@
 
 import logging
 import os
+from pathlib import Path
 import tempfile
 import time
 from typing import Any, Dict, List, Optional, Union
-from pathlib import Path
 
 import requests
 from llama_index.core.readers import SimpleDirectoryReader
@@ -767,7 +767,7 @@ def list_resources(
                 recursive=recursive,
                 userprincipalname=userprincipalname,
             )
-            return [payload.resource_info["file_path"] for payload in payloads]
+            return [str(payload.resource_info["file_path"]) for payload in payloads]
         except Exception as e:
             logger.error(
                 f"An error occurred while listing resources: {e}", exc_info=True
@@ -819,20 +819,44 @@ async def aload_resource(
     ) -> List[Document]:
         return self.load_resource(resource_id, *args, **kwargs)
 
-    def read_file_content(self, input_file: Path, **kwargs) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs) -> bytes:
+        """
+        Read the bytes content of a OneDrive file.
+
+        Args:
+            resource_id (str): File path, in the form returned by ``list_resources``.
+            **kwargs: Forwarded to ``_get_downloaded_files_metadata``.
+
+        Returns:
+            bytes: File content.
+
+        Raises:
+            ValueError: If no downloaded file matches ``resource_id``.
+        """
         with tempfile.TemporaryDirectory() as temp_dir:
             payloads = self._get_downloaded_files_metadata(
-                file_paths=[str(input_file)], temp_dir=temp_dir, **kwargs
+                file_paths=[resource_id], temp_dir=temp_dir, **kwargs
             )
-            local_file_path = next(
-                payloads.downloaded_file_path
-                for payloads in payloads
-                if payloads.resource_info["file_path"] == str(input_file)
-            )
+            # Default to None so a missing match reaches the ValueError below
+            # instead of escaping as StopIteration; str() mirrors list_resources.
+            local_file_path = next(
+                (
+                    payload.downloaded_file_path
+                    for payload in payloads
+                    if str(payload.resource_info["file_path"]) == resource_id
+                ),
+                None,
+            )
             if not local_file_path:
                 raise ValueError("File was not downloaded successfully.")
             with open(local_file_path, "rb") as f:
                 return f.read()
 
-    async def aread_file_content(self, input_file: Path, **kwargs) -> bytes:
-        return self.read_file_content(input_file, **kwargs)
+    async def aread_file_content(self, resource_id: str, **kwargs) -> bytes:
+        return self.read_file_content(resource_id, **kwargs)  # calls the sync implementation; nothing is awaited here
+
+    def get_file_name(self, resource_id: str) -> str:
+        return Path(resource_id).name  # last path component of the resource ID
+
+    def get_file_path(self, resource_id: str) -> str:
+        return resource_id  # the resource ID is returned unchanged as the path
diff --git a/...llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py b/...llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py
@@ -688,7 +688,7 @@ def list_resources(
         sharepoint_folder_id: Optional[str] = None,
         sharepoint_site_id: Optional[str] = None,
         recursive: bool = True,
-    ) -> List[Path]:
+    ) -> List[str]:
         """
         Lists the files in the specified folder in the SharePoint site.
 
@@ -738,11 +738,11 @@ def list_resources(
                     recursive,
                     os.path.join(sharepoint_site_name, sharepoint_folder_path),
                 )
-                file_paths.extend(folder_contents)
+                file_paths.extend([str(path) for path in folder_contents])
             else:
                 # Fetch drive contents
                 drive_contents = self._list_drive_contents()
-                file_paths.extend(drive_contents)
+                file_paths.extend([str(path) for path in drive_contents])
         except Exception as exp:
             logger.error("An error occurred while listing files in SharePoint: %s", exp)
             raise
@@ -847,19 +847,25 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
             )
             raise
 
-    def read_file_content(self, input_file: Path, **kwargs) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs) -> bytes:
         try:
             access_token = self._get_access_token()
             self._site_id_with_host_name = self._get_site_id_with_host_name(
                 access_token, self.sharepoint_site_name
             )
             self._drive_id = self._get_drive_id()
 
-            item = self._get_item_from_path(input_file)
+            item = self._get_item_from_path(Path(resource_id))  # helper is still given a Path, as before this change
             return self._get_file_content_by_url(item)
 
         except Exception as exp:
             logger.error(
                 "An error occurred while reading file content from SharePoint: %s", exp
             )
             raise
+
+    def get_file_name(self, resource_id: str) -> str:
+        return Path(resource_id).name  # last path component of the resource ID
+
+    def get_file_path(self, resource_id: str) -> str:
+        return resource_id  # the resource ID is returned unchanged as the path
diff --git a/llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py b/llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py
@@ -210,6 +210,12 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
         docs = simple_directory_reader.load_resource(resource_id, **kwargs)
         return self._adjust_documents(docs)
 
-    def read_file_content(self, input_file: Path, **kwargs) -> bytes:
+    def read_file_content(self, resource_id: str, **kwargs) -> bytes:
         simple_directory_reader = self._get_simple_directory_reader()
-        return simple_directory_reader.read_file_content(input_file, **kwargs)
+        return simple_directory_reader.read_file_content(resource_id, **kwargs)  # delegate; pass the str ID through as-is
+
+    def get_file_name(self, resource_id: str) -> str:
+        return Path(resource_id).name  # last path component of the resource ID
+
+    def get_file_path(self, resource_id: str) -> str:
+        return resource_id  # the resource ID is returned unchanged as the path