fix: multibyte entries breaking stuff
zleyyij committed Apr 25, 2024
1 parent 2bccaba commit 8baad94
Showing 3 changed files with 31 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/scripts/parser/Cargo.toml
@@ -16,5 +16,5 @@ tinyvec = { version = "1", features = ["alloc"]}
indexmap = "2"
encoding = "0.2"
nom = "7.1.3"
-zstd = { version = "0.13.1", features = ["wasm"] }
+# zstd = { version = "0.13.1", features = ["wasm", "no_asm"], default-features = false }
color-eyre = "0.6.3"
3 changes: 1 addition & 2 deletions src/scripts/parser/src/lib.rs
@@ -1,5 +1,4 @@
#![allow(soft_unstable)]
mod lang;
mod lexer;
mod parser;

@@ -113,7 +112,7 @@ mod tests {
#[test]
fn parse_csv_from_file() {
// TODO
let mut file_handle = File::open("/Users/arc/Downloads/asd.csv").unwrap();
let mut file_handle = File::open("/Users/arc/Downloads/help.csv").unwrap();
let mut file_vec = Vec::new();
file_handle.read_to_end(&mut file_vec).unwrap();
parse_csv(&file_vec);
34 changes: 29 additions & 5 deletions src/scripts/parser/src/parser.rs
@@ -52,7 +52,7 @@ pub mod parser {
if column[0] == "Date" {
return;
}
-// the date is skipped because it's not really valuble overhead
+// the date is skipped because it's not really needed
for entry in &column[1..column.len()] {
// handle special cases
match *entry {
@@ -84,7 +84,7 @@ pub mod parser {
}
#[cfg(not(wasm))]
{
println!("Failed to parse entry {entry} into a float, skipping");
println!("Failed to parse entry {entry} into a float, skipping column {}", column[0]);
}
}
}
@@ -95,10 +95,17 @@ pub mod parser {
// because hashmaps can't have two duplicate keys, append (n) to keys that already exit in the map
while map.contains_key(insertion_key.as_str()) {
// select the last 3 chars, and strip parentheses and whitespace, before parsing it to a number
let last_chars = &insertion_key.chars().collect::<Vec<char>>()
[insertion_key.len() - 3..]
// while 3 chars is generally better, fallback to single char to avoid subtraction with overflow
let last_chars = if insertion_key.chars().count() >= 3 {
insertion_key.chars().collect::<Vec<char>>()
// .chars().count() is used here instead of .len() because .len()
// breaks for multibyte chars
[insertion_key.chars().count() - 3..]
.iter()
.collect::<String>();
.collect::<String>()
} else {
insertion_key.to_string()
};
// sometimes there's more than one duplicate
if last_chars.ends_with(")") {
// the "number" found at the end, as a string
@@ -270,5 +277,22 @@ pub mod parser {

assert_eq!(deserialize_csv(mock_csv), expected_output);
}

#[test]
fn multibyte_utf8_column_header() {
let mock_csv = vec![
vec!["πŸ¦†", "πŸ¦†", "πŸ¦†"],
vec!["0.0", "1.0", "2.0"],
vec!["0.0", "1.0", "2.0"],
vec!["πŸ¦†", "πŸ¦†", "πŸ¦†"],
];
let mut expected_output: HashMap<String, Vec<f64>> = HashMap::new();
expected_output.insert("πŸ¦†".to_owned(), vec![0.0, 0.0]);
expected_output.insert("πŸ¦† (1)".to_owned(), vec![1.0, 1.0]);
expected_output.insert("πŸ¦† (2)".to_owned(), vec![2.0, 2.0]);

assert_eq!(deserialize_csv(mock_csv), expected_output);

}
}
}
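
The "πŸ¦† (1)" / "πŸ¦† (2)" keys the new test expects come from the duplicate-header handling above: repeated column names get a parenthesised counter appended so they can coexist as HashMap keys. A rough, hypothetical sketch of that naming scheme (the helper name unique_key and its counter handling are illustrative, not the parser's actual code):

use std::collections::HashMap;

// Hypothetical helper, not the parser's real implementation: pick a map key for a
// column header, appending " (n)" while the name is already taken.
fn unique_key(map: &HashMap<String, Vec<f64>>, header: &str) -> String {
    let mut key = header.to_string();
    let mut n = 1;
    while map.contains_key(&key) {
        key = format!("{header} ({n})");
        n += 1;
    }
    key
}

fn main() {
    let mut map: HashMap<String, Vec<f64>> = HashMap::new();
    for _ in 0..3 {
        let key = unique_key(&map, "πŸ¦†");
        map.insert(key, Vec::new());
    }
    let mut keys: Vec<String> = map.keys().cloned().collect();
    keys.sort();
    assert_eq!(keys, ["πŸ¦†", "πŸ¦† (1)", "πŸ¦† (2)"]);
}
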
