Skip to content

Commit

Permalink
fix: UnicodeEncodeError when saving/loading a knowledge graph (#1900)
Browse files Browse the repository at this point in the history
  • Loading branch information
jjmachan authored Feb 4, 2025
1 parent 1dd4b29 commit 2605de5
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 4 deletions.
35 changes: 31 additions & 4 deletions src/ragas/testset/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,24 +173,51 @@ def _add_relationship(self, relationship: Relationship):
self.relationships.append(relationship)

def save(self, path: t.Union[str, Path]):
    """Save the knowledge graph to a JSON file.

    Parameters
    ----------
    path : Union[str, Path]
        Path where the JSON file should be saved.

    Notes
    -----
    The file is written using UTF-8 encoding to ensure proper handling of
    Unicode characters across different platforms.
    """
    if isinstance(path, str):
        path = Path(path)

    data = {
        "nodes": [node.model_dump() for node in self.nodes],
        "relationships": [rel.model_dump() for rel in self.relationships],
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # must be opened with an explicit encoding that can represent them;
    # relying on the platform default codec is what this method must avoid.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, cls=UUIDEncoder, indent=2, ensure_ascii=False)

@classmethod
def load(cls, path: t.Union[str, Path]) -> "KnowledgeGraph":
"""Loads a knowledge graph from a path."""
"""Loads a knowledge graph from a path.
Parameters
----------
path : Union[str, Path]
Path to the JSON file containing the knowledge graph.
Returns
-------
KnowledgeGraph
The loaded knowledge graph.
Notes
-----
The file is read using UTF-8 encoding to ensure proper handling of Unicode characters
across different platforms.
"""
if isinstance(path, str):
path = Path(path)

with open(path, "r") as f:
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)

nodes = [Node(**node_data) for node_data in data["nodes"]]
Expand Down
62 changes: 62 additions & 0 deletions tests/unit/test_knowledge_graph_save.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from ragas.testset.graph import KnowledgeGraph, Node, NodeType, Relationship


def test_knowledge_graph_save_with_problematic_chars(tmp_path):
    """Round-trip a graph full of non-ASCII text through save() and load().

    Builds one node per "problematic" character, chains the nodes with
    relationships that also carry those characters, saves the graph to a
    temporary JSON file, loads it back, and checks that every node and
    relationship property survived unchanged.
    """
    kg = KnowledgeGraph()

    # Characters that commonly trip platform-default ("charmap") codecs.
    problematic_chars = [
        # Full Latin-1 supplement, U+0080 through U+00FF inclusive.
        chr(code_point)
        for code_point in range(0x0080, 0x0100)
    ] + [
        "\u2022",  # bullet
        "\u2192",  # arrow
        "\u2665",  # heart
        "\u2605",  # star
        "\u221E",  # infinity
        "\u00B5",  # micro
        "\u2264",  # less than or equal
        "\u2265",  # greater than or equal
        "\u0391",  # Greek letters
        "\u0392",
        "\u0393",
        "\uFFFF",  # Special Unicode characters
    ]

    # One node per character, with the character embedded in every property.
    for i, char in enumerate(problematic_chars):
        text = f"Test{char}Text with special char at position {i}"
        node = Node(
            properties={
                "text": text,
                "description": f"Node {i} with {char}",
                "metadata": f"Extra {char} info",
            },
            type=NodeType.CHUNK,
        )
        kg.add(node)

    # Chain consecutive nodes so relationships are serialized as well.
    nodes = kg.nodes
    for i, (source, target) in enumerate(zip(nodes, nodes[1:])):
        rel = Relationship(
            source=source,
            target=target,
            type="next",
            properties={"info": f"Link {i} with special char {problematic_chars[i]}"},
        )
        kg.add(rel)

    # Save to a temporary file and load it back.
    save_path = tmp_path / "test_knowledge_graph.json"
    kg.save(str(save_path))
    loaded_kg = KnowledgeGraph.load(str(save_path))

    # Verify nothing was dropped.
    assert len(loaded_kg.nodes) == len(kg.nodes)
    assert len(loaded_kg.relationships) == len(kg.relationships)

    # Verify every character survived, not just the one in the first node;
    # a codec problem may affect only some code points.
    for original, loaded in zip(kg.nodes, loaded_kg.nodes):
        assert loaded.properties == original.properties

    for original, loaded in zip(kg.relationships, loaded_kg.relationships):
        assert loaded.properties == original.properties

0 comments on commit 2605de5

Please sign in to comment.