Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add get_file_name and get_file_path to filesystem readers #17075

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
46 changes: 39 additions & 7 deletions llama-index-core/llama_index/core/readers/file/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,28 +23,52 @@

class FileSystemReaderMixin(ABC):
@abstractmethod
def read_file_content(self, input_file: Path, **kwargs: Any) -> bytes:
def read_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
"""
Read the bytes content of a file.

Args:
input_file (Path): Path to the file.
resource_id (str): Resource ID.

Returns:
bytes: File content.
"""

async def aread_file_content(self, input_file: Path, **kwargs: Any) -> bytes:
async def aread_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
"""
Read the bytes content of a file asynchronously.

Args:
input_file (Path): Path to the file.
resource_id (str): Resource ID.

Returns:
bytes: File content.
"""
return self.read_file_content(input_file, **kwargs)
return self.read_file_content(resource_id, **kwargs)

@abstractmethod
def get_file_name(self, resource_id: str) -> str:
    """
    Get the file name that corresponds to a resource ID.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: File name.
    """

async def aget_file_name(self, resource_id: str) -> str:
    """
    Get the file name that corresponds to a resource ID asynchronously.

    This default implementation simply calls the synchronous
    ``get_file_name`` and returns its result.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: File name.
    """
    file_name = self.get_file_name(resource_id)
    return file_name

@abstractmethod
def get_file_path(self, resource_id: str) -> str:
    """
    Get the file path that corresponds to a resource ID.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: File path.
    """

async def aget_file_path(self, resource_id: str) -> str:
    """
    Get the file path that corresponds to a resource ID asynchronously.

    This default implementation simply calls the synchronous
    ``get_file_path`` and returns its result.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: File path.
    """
    file_path = self.get_file_path(resource_id)
    return file_path


def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
Expand Down Expand Up @@ -465,10 +489,12 @@ async def aload_resource(
**kwargs,
)

def read_file_content(self, input_file: Path, **kwargs: Any) -> bytes:
def read_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
"""Read file content."""
fs: fsspec.AbstractFileSystem = kwargs.get("fs", self.fs)
with fs.open(input_file, errors=self.errors, encoding=self.encoding) as f:
with fs.open(
Path(resource_id), errors=self.errors, encoding=self.encoding
) as f:
return f.read()

@staticmethod
Expand Down Expand Up @@ -792,3 +818,9 @@ def iter_data(

if len(documents) > 0:
yield documents

def get_file_name(self, resource_id: str) -> str:
    """
    Return the final path component of the resource ID.

    Args:
        resource_id (str): Resource ID, interpreted as a path.

    Returns:
        str: The ``name`` part of ``Path(resource_id)``.
    """
    file_name = Path(resource_id).name
    return file_name

def get_file_path(self, resource_id: str) -> str:
    """
    Return the file path for a resource ID.

    The resource ID is returned unchanged.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: The same value that was passed in.
    """
    file_path = resource_id
    return file_path
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,10 @@ def load_resource(self, resource_id: str, **kwargs: Any) -> List[Document]:
)
raise

def read_file_content(self, input_file: Path, **kwargs) -> bytes:
def read_file_content(self, resource_id: str, **kwargs: Any) -> bytes:
"""Read the content of a file from Azure Storage Blob."""
container_client = self._get_container_client()
blob_client = container_client.get_blob_client(input_file)
blob_client = container_client.get_blob_client(resource_id)
stream = blob_client.download_blob()
return stream.readall()

Expand All @@ -230,3 +230,9 @@ def load_data(self) -> List[Document]:
logger.info("Document creation starting")

return self._load_documents_with_metadata(files_metadata, temp_dir)

def get_file_name(self, resource_id: str) -> str:
    """
    Return the final path component of the resource ID.

    Args:
        resource_id (str): Resource ID, interpreted as a path.

    Returns:
        str: The ``name`` part of ``Path(resource_id)``.
    """
    file_name = Path(resource_id).name
    return file_name

def get_file_path(self, resource_id: str) -> str:
    """
    Return the file path for a resource ID.

    The resource ID is returned unchanged.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: The same value that was passed in.
    """
    file_path = resource_id
    return file_path
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ def list_resources(
)
return [file.id for file in box_files]

def read_file_content(self, input_file: Path, **kwargs) -> bytes:
file_id = input_file.name
def read_file_content(self, resource_id: str, **kwargs) -> bytes:
file_id = Path(resource_id).name
return get_file_content_by_id(box_client=self._box_client, box_file_id=file_id)

def search_resources(
Expand Down Expand Up @@ -378,3 +378,12 @@ def _download_files(self, box_files: List[File], temp_dir: str) -> List[File]:
file.downloaded_file_path = local_path
box_files_with_path.append(file)
return box_files_with_path

def get_file_name(self, resource_id: str) -> str:
    """
    Look up the name of a Box file.

    Requests the details for ``resource_id`` through
    ``get_box_files_details`` and returns the ``name`` attribute of the
    first entry in the result.

    Args:
        resource_id (str): Resource ID, passed as the single file ID.

    Returns:
        str: Name of the first returned file entry.
    """
    details = get_box_files_details(
        box_client=self._box_client,
        file_ids=[resource_id],
    )
    first_entry = details[0]
    return first_entry.name

def get_file_path(self, resource_id: str) -> str:
    """
    Return the file path for a resource ID.

    This reader returns the same value as ``get_file_name``.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: Result of ``get_file_name(resource_id)``.
    """
    file_name = self.get_file_name(resource_id)
    return file_name
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,12 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
logger.error(f"Error loading resource from GCS: {e!s}")
raise

def read_file_content(self, input_file: Path, **kwargs) -> bytes:
def read_file_content(self, resource_id: str, **kwargs) -> bytes:
"""
Read the content of a specific file from GCS.

Args:
input_file (Path): The path to the file to read.
resource_id (str): The resource ID to read.
**kwargs: Additional arguments to pass to the underlying read_file_content method.

Returns:
Expand All @@ -266,10 +266,16 @@ def read_file_content(self, input_file: Path, **kwargs) -> bytes:
Exception: If there's an error reading the file content.
"""
try:
logger.info(f"Reading file content: {input_file}")
logger.info(f"Reading file content: {resource_id}")
return self._get_simple_directory_reader().read_file_content(
input_file, **kwargs
Path(resource_id), **kwargs
)
except Exception as e:
logger.error(f"Error reading file content from GCS: {e!s}")
raise

def get_file_name(self, resource_id: str) -> str:
    """
    Return the final path component of the resource ID.

    Args:
        resource_id (str): Resource ID, interpreted as a path.

    Returns:
        str: The ``name`` part of ``Path(resource_id)``.
    """
    file_name = Path(resource_id).name
    return file_name

def get_file_path(self, resource_id: str) -> str:
    """
    Return the file path for a resource ID.

    The resource ID is returned unchanged.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: The same value that was passed in.
    """
    file_path = resource_id
    return file_path
Original file line number Diff line number Diff line change
Expand Up @@ -680,12 +680,34 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
self.drive_id, [resource_id], None, self.query_string
)

def read_file_content(self, file_path: Union[str, Path], **kwargs) -> bytes:
def read_file_content(self, resource_id: str, **kwargs) -> bytes:
"""Read the content of a specific file from Google Drive."""
self._creds = self._get_credentials()

with tempfile.TemporaryDirectory() as temp_dir:
temp_file = os.path.join(temp_dir, "temp_file")
downloaded_file = self._download_file(file_path, temp_file)
downloaded_file = self._download_file(resource_id, temp_file)
with open(downloaded_file, "rb") as file:
return file.read()

def get_file_name(self, resource_id: str) -> str:
    """
    Get the file name from the resource ID.

    Stores the result of ``self._get_credentials()`` in ``self._creds``,
    builds a Drive v3 service with those credentials, fetches the file
    entry for ``resource_id`` and returns its ``"name"`` field.

    Args:
        resource_id (str): Resource ID, sent as the ``fileId`` of the request.

    Returns:
        str: Value of the ``"name"`` key in the response.
    """
    from googleapiclient.discovery import build

    self._creds = self._get_credentials()
    drive_service = build("drive", "v3", credentials=self._creds)

    request = drive_service.files().get(fileId=resource_id, supportsAllDrives=True)
    metadata = request.execute()
    return metadata["name"]

def get_file_path(self, resource_id: str) -> str:
    """
    Get the file path from the resource ID.

    Stores the result of ``self._get_credentials()`` in ``self._creds``,
    builds a Drive v3 service with those credentials and delegates to
    ``self._get_relative_path`` with ``self.folder_id`` as the folder.

    Args:
        resource_id (str): Resource ID, passed on as ``file_id``.

    Returns:
        str: Whatever ``self._get_relative_path`` returns for this file.
    """
    from googleapiclient.discovery import build

    self._creds = self._get_credentials()
    drive_service = build("drive", "v3", credentials=self._creds)

    relative_path = self._get_relative_path(
        drive_service,
        file_id=resource_id,
        folder_id=self.folder_id,
    )
    return relative_path
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import logging
import os
from pathlib import Path
import tempfile
import time
from typing import Any, Dict, List, Optional, Union
from pathlib import Path

import requests
from llama_index.core.readers import SimpleDirectoryReader
Expand Down Expand Up @@ -767,7 +767,7 @@ def list_resources(
recursive=recursive,
userprincipalname=userprincipalname,
)
return [payload.resource_info["file_path"] for payload in payloads]
return [str(payload.resource_info["file_path"]) for payload in payloads]
except Exception as e:
logger.error(
f"An error occurred while listing resources: {e}", exc_info=True
Expand Down Expand Up @@ -819,20 +819,26 @@ async def aload_resource(
) -> List[Document]:
return self.load_resource(resource_id, *args, **kwargs)

def read_file_content(self, input_file: Path, **kwargs) -> bytes:
def read_file_content(self, resource_id: str, **kwargs) -> bytes:
with tempfile.TemporaryDirectory() as temp_dir:
payloads = self._get_downloaded_files_metadata(
file_paths=[str(input_file)], temp_dir=temp_dir, **kwargs
file_paths=[resource_id], temp_dir=temp_dir, **kwargs
)
local_file_path = next(
payloads.downloaded_file_path
for payloads in payloads
if payloads.resource_info["file_path"] == str(input_file)
if payloads.resource_info["file_path"] == resource_id
)
if not local_file_path:
raise ValueError("File was not downloaded successfully.")
with open(local_file_path, "rb") as f:
return f.read()

async def aread_file_content(self, input_file: Path, **kwargs) -> bytes:
return self.read_file_content(input_file, **kwargs)
async def aread_file_content(self, resource_id: str, **kwargs) -> bytes:
return self.read_file_content(resource_id, **kwargs)

def get_file_name(self, resource_id: str) -> str:
    """
    Return the final path component of the resource ID.

    Args:
        resource_id (str): Resource ID, interpreted as a path.

    Returns:
        str: The ``name`` part of ``Path(resource_id)``.
    """
    file_name = Path(resource_id).name
    return file_name

def get_file_path(self, resource_id: str) -> str:
    """
    Return the file path for a resource ID.

    The resource ID is returned unchanged.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: The same value that was passed in.
    """
    file_path = resource_id
    return file_path
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def list_resources(
sharepoint_folder_id: Optional[str] = None,
sharepoint_site_id: Optional[str] = None,
recursive: bool = True,
) -> List[Path]:
) -> List[str]:
"""
Lists the files in the specified folder in the SharePoint site.

Expand Down Expand Up @@ -738,11 +738,11 @@ def list_resources(
recursive,
os.path.join(sharepoint_site_name, sharepoint_folder_path),
)
file_paths.extend(folder_contents)
file_paths.extend([str(path) for path in folder_contents])
else:
# Fetch drive contents
drive_contents = self._list_drive_contents()
file_paths.extend(drive_contents)
file_paths.extend([str(path) for path in drive_contents])
except Exception as exp:
logger.error("An error occurred while listing files in SharePoint: %s", exp)
raise
Expand Down Expand Up @@ -847,19 +847,25 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
)
raise

def read_file_content(self, input_file: Path, **kwargs) -> bytes:
def read_file_content(self, resource_id: str, **kwargs) -> bytes:
try:
access_token = self._get_access_token()
self._site_id_with_host_name = self._get_site_id_with_host_name(
access_token, self.sharepoint_site_name
)
self._drive_id = self._get_drive_id()

item = self._get_item_from_path(input_file)
item = self._get_item_from_path(Path(resource_id))
return self._get_file_content_by_url(item)

except Exception as exp:
logger.error(
"An error occurred while reading file content from SharePoint: %s", exp
)
raise

def get_file_name(self, resource_id: str) -> str:
    """
    Return the final path component of the resource ID.

    Args:
        resource_id (str): Resource ID, interpreted as a path.

    Returns:
        str: The ``name`` part of ``Path(resource_id)``.
    """
    file_name = Path(resource_id).name
    return file_name

def get_file_path(self, resource_id: str) -> str:
    """
    Return the file path for a resource ID.

    The resource ID is returned unchanged.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: The same value that was passed in.
    """
    file_path = resource_id
    return file_path
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,12 @@ def load_resource(self, resource_id: str, **kwargs) -> List[Document]:
docs = simple_directory_reader.load_resource(resource_id, **kwargs)
return self._adjust_documents(docs)

def read_file_content(self, input_file: Path, **kwargs) -> bytes:
def read_file_content(self, resource_id: str, **kwargs) -> bytes:
simple_directory_reader = self._get_simple_directory_reader()
return simple_directory_reader.read_file_content(input_file, **kwargs)
return simple_directory_reader.read_file_content(Path(resource_id), **kwargs)

def get_file_name(self, resource_id: str) -> str:
    """
    Return the final path component of the resource ID.

    Args:
        resource_id (str): Resource ID, interpreted as a path.

    Returns:
        str: The ``name`` part of ``Path(resource_id)``.
    """
    file_name = Path(resource_id).name
    return file_name

def get_file_path(self, resource_id: str) -> str:
    """
    Return the file path for a resource ID.

    The resource ID is returned unchanged.

    Args:
        resource_id (str): Resource ID.

    Returns:
        str: The same value that was passed in.
    """
    file_path = resource_id
    return file_path
Loading