From 33372697e5f5d1848582488c25a036fa387b60e5 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 01/30] sourcepos: fix(?) HtmlInline. Pretty unsure about the "+ extra" but we'll see! --- src/parser/inlines.rs | 39 ++++++++++++++++++++++++--------------- src/tests/sourcepos.rs | 1 - 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 0e1d0d46..cfe2bb54 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -191,10 +191,10 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { let new_inl: Option<&'a AstNode<'a>> = match c { '\0' => return false, '\r' | '\n' => Some(self.handle_newline()), - '`' => Some(self.handle_backticks()), + '`' => Some(self.handle_backticks(&node_ast.line_offsets)), '\\' => Some(self.handle_backslash()), '&' => Some(self.handle_entity()), - '<' => Some(self.handle_pointy_brace()), + '<' => Some(self.handle_pointy_brace(&node_ast.line_offsets)), ':' => { let mut res = None; @@ -288,7 +288,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { '^' if self.options.extension.superscript && !self.within_brackets => { Some(self.handle_delim(b'^')) } - '$' => Some(self.handle_dollars()), + '$' => Some(self.handle_dollars(&node_ast.line_offsets)), '|' if self.options.extension.spoiler => Some(self.handle_delim(b'|')), _ => { let endpos = self.find_special_char(); @@ -603,11 +603,13 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.input.len() } - fn adjust_node_newlines(&mut self, node: &'a AstNode<'a>, matchlen: usize, extra: usize) { - if !self.options.render.sourcepos { - return; - } - + fn adjust_node_newlines( + &mut self, + node: &'a AstNode<'a>, + matchlen: usize, + extra: usize, + parent_line_offsets: &[usize], + ) { let (newlines, since_newline) = count_newlines(&self.input[self.pos - matchlen - extra..self.pos - extra]); @@ -615,7 +617,9 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> 
Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.line += newlines; let node_ast = &mut node.data.borrow_mut(); node_ast.sourcepos.end.line += newlines; - node_ast.sourcepos.end.column = since_newline; + let adjusted_line = self.line - node_ast.sourcepos.start.line; + node_ast.sourcepos.end.column = + parent_line_offsets[adjusted_line] + since_newline + extra; self.column_offset = -(self.pos as isize) + since_newline as isize + extra as isize; } } @@ -684,7 +688,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn handle_backticks(&mut self) -> &'a AstNode<'a> { + pub fn handle_backticks(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { let openticks = self.take_while(b'`'); let startpos = self.pos; let endpos = self.scan_to_closing_backtick(openticks); @@ -703,7 +707,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { }; let node = self.make_inline(NodeValue::Code(code), startpos, endpos - openticks - 1); - self.adjust_node_newlines(node, endpos - startpos, openticks); + self.adjust_node_newlines(node, endpos - startpos, openticks, parent_line_offsets); node } } @@ -781,7 +785,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } // Heuristics used from https://pandoc.org/MANUAL.html#extension-tex_math_dollars - pub fn handle_dollars(&mut self) -> &'a AstNode<'a> { + pub fn handle_dollars(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { if self.options.extension.math_dollars || self.options.extension.math_code { let opendollars = self.take_while(b'$'); let mut code_math = false; @@ -849,7 +853,12 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { startpos, endpos - fence_length - 1, ); - self.adjust_node_newlines(node, endpos - startpos, fence_length); + self.adjust_node_newlines( + node, + endpos - startpos, + fence_length, + parent_line_offsets, + ); node } } @@ -1332,7 +1341,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { 
self.handle_autolink_with(node, autolink::www_match) } - pub fn handle_pointy_brace(&mut self) -> &'a AstNode<'a> { + pub fn handle_pointy_brace(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { self.pos += 1; if let Some(matchlen) = scanners::autolink_uri(&self.input[self.pos..]) { @@ -1426,7 +1435,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.pos - matchlen - 1, self.pos - 1, ); - self.adjust_node_newlines(inl, matchlen, 1); + self.adjust_node_newlines(inl, matchlen, 1, parent_line_offsets); return inl; } diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index a18da6b3..d66bb83c 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -418,7 +418,6 @@ fn node_values() -> HashMap { | DescriptionItem // end is 4:0 | DescriptionTerm // end is 3:0 | DescriptionDetails // end is 4:0 - | HtmlInline // end is 1:31 but should be 3:14 | LineBreak // start is 1:15 but should be 1:13 | Code // is 1:8-1:12 but should be 1:7-1:13 | ThematicBreak // end is 4:0 From 4729f60f655bfd39af021e01c53f52222aad4d4e Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 02/30] math: fix sourcepos test, mark #[ignore]. For consistency with other inlines, we should include the opening and closing $ / $$ / $`; there shouldn't be characters in the source document not covered by sourcepos. 
--- src/tests/math.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/tests/math.rs b/src/tests/math.rs index 4136bc76..5920c50d 100644 --- a/src/tests/math.rs +++ b/src/tests/math.rs @@ -134,6 +134,7 @@ fn math_unrecognized_syntax_non_roundtrip(markdown: &str, html: &str) { } #[test] +#[ignore] fn sourcepos() { assert_ast_match!( [extension.math_dollars, extension.math_code], @@ -148,14 +149,14 @@ fn sourcepos() { "```\n", (document (1:1-9:3) [ (paragraph (1:1-1:29) [ - (math (1:2-1:4)) + (math (1:1-1:5)) (text (1:6-1:10) " and ") - (math (1:13-1:15)) + (math (1:11-1:17)) (text (1:18-1:22) " and ") - (math (1:25-1:27)) + (math (1:23-1:29)) ]) (paragraph (3:1-5:2) [ - (math (3:3-5:0)) + (math (3:1-5:2)) ]) (code_block (7:1-9:3)) ]) From 062865ca55d570c94855d9a5cb7ea3e9f1278200 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 03/30] inlines: fix Code sourcepos. --- src/parser/inlines.rs | 22 +++++++++++++++------- src/tests/regressions.rs | 13 +++++++++++++ src/tests/sourcepos.rs | 1 - 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index cfe2bb54..ab8483f6 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -689,25 +689,33 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } pub fn handle_backticks(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { - let openticks = self.take_while(b'`'); let startpos = self.pos; + let openticks = self.take_while(b'`'); let endpos = self.scan_to_closing_backtick(openticks); match endpos { None => { - self.pos = startpos; - self.make_inline(NodeValue::Text("`".repeat(openticks)), self.pos, self.pos) + self.pos = startpos + openticks; + self.make_inline( + NodeValue::Text("`".repeat(openticks)), + startpos, + self.pos - 1, + ) } Some(endpos) => { - let buf = &self.input[startpos..endpos - openticks]; + let buf = &self.input[startpos + openticks..endpos - 
openticks]; let buf = strings::normalize_code(buf); let code = NodeCode { num_backticks: openticks, literal: String::from_utf8(buf).unwrap(), }; - let node = - self.make_inline(NodeValue::Code(code), startpos, endpos - openticks - 1); - self.adjust_node_newlines(node, endpos - startpos, openticks, parent_line_offsets); + let node = self.make_inline(NodeValue::Code(code), startpos, endpos - 1); + self.adjust_node_newlines( + node, + endpos - startpos - openticks, + openticks, + parent_line_offsets, + ); node } } diff --git a/src/tests/regressions.rs b/src/tests/regressions.rs index f0827134..a2456aac 100644 --- a/src/tests/regressions.rs +++ b/src/tests/regressions.rs @@ -133,3 +133,16 @@ fn sourcepos_para() { fn gemoji() { html_opts!([extension.shortcodes], ":x:", "

\n"); } + +#[test] +fn sourcepos_lone_backtick() { + assert_ast_match!( + [], + "``\n", + (document (1:1-1:2) [ + (paragraph (1:1-1:2) [ + (text (1:1-1:2) "``") + ]) + ]) + ); +} diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index d66bb83c..8f138c3a 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -419,7 +419,6 @@ fn node_values() -> HashMap { | DescriptionTerm // end is 3:0 | DescriptionDetails // end is 4:0 | LineBreak // start is 1:15 but should be 1:13 - | Code // is 1:8-1:12 but should be 1:7-1:13 | ThematicBreak // end is 4:0 | Link // inconsistent between link types | Math // is 3:2-3:6 but should be 3:1-3:7 From 888e521f93fd021e04eab01cdde84f60248a8a3e Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 04/30] inlines: fix (one) LineBreak sourcepos. Need to check the other kind. --- src/parser/inlines.rs | 2 +- src/tests/sourcepos.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index ab8483f6..52ae348c 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -633,7 +633,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.pos += 1; } let inl = if nlpos > 1 && self.input[nlpos - 1] == b' ' && self.input[nlpos - 2] == b' ' { - self.make_inline(NodeValue::LineBreak, nlpos, self.pos - 1) + self.make_inline(NodeValue::LineBreak, nlpos - 2, self.pos - 1) } else { self.make_inline(NodeValue::SoftBreak, nlpos, self.pos - 1) }; diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index 8f138c3a..06b919fc 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -418,7 +418,6 @@ fn node_values() -> HashMap { | DescriptionItem // end is 4:0 | DescriptionTerm // end is 3:0 | DescriptionDetails // end is 4:0 - | LineBreak // start is 1:15 but should be 1:13 | ThematicBreak // end is 4:0 | Link // inconsistent between link types | Math // is 3:2-3:6 but should be 3:1-3:7 From 
f8abcacf0bc1b8518e75ee0eddf810500b8e8dfb Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 05/30] tests: test other LineBreak sourcepos. --- src/tests/sourcepos.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index 06b919fc..4b703af8 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -266,7 +266,10 @@ hello world ); const SOFT_BREAK: TestCase = (&[sourcepos!((1:13-1:13))], "stuff before\nstuff after"); -const LINE_BREAK: TestCase = (&[sourcepos!((1:13-1:15))], "stuff before \nstuff after"); +const LINE_BREAK: TestCase = ( + &[sourcepos!((1:13-1:15)), sourcepos!((4:13-4:14))], + "stuff before \nstuff after\n\nstuff before\\\nstuff after\n", +); const CODE: TestCase = (&[sourcepos!((1:7-1:13))], "hello `world`"); From 447443bb60bb76ec1ffda5ef2d6352e9d2706a42 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 06/30] inlines: fix basic autolinks. (Tests failing other links.) 
--- src/parser/inlines.rs | 4 ++-- src/tests/sourcepos.rs | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 52ae348c..1016e74a 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -1990,8 +1990,8 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { url: String::from_utf8(strings::clean_autolink(url, kind)).unwrap(), title: String::new(), }), - start_column + 1, - end_column + 1, + start_column, + end_column, ); inl.append(self.make_inline( NodeValue::Text(String::from_utf8(entity::unescape_html(url)).unwrap()), diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index 4b703af8..4811f4f2 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -422,7 +422,6 @@ fn node_values() -> HashMap { | DescriptionTerm // end is 3:0 | DescriptionDetails // end is 4:0 | ThematicBreak // end is 4:0 - | Link // inconsistent between link types | Math // is 3:2-3:6 but should be 3:1-3:7 | Raw // unparseable ) From 68b2b9ae2d5d9b36c7115cbce7ad4ff43e8db4f0 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 07/30] inlines: fix sourcepos for colon/w autolinks; add tests (email still broken). 
--- src/parser/inlines.rs | 30 +++++++++++++++++++++++++----- src/tests/sourcepos.rs | 4 ++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 1016e74a..e3dc237b 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -1304,18 +1304,19 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { if !self.options.parse.relaxed_autolinks && self.within_brackets { return None; } - let (post, mut reverse, skip) = f( + let startpos = self.pos; + let (post, need_reverse, skip) = f( self.arena, self.input, self.pos, self.options.parse.relaxed_autolinks, )?; - self.pos += skip - reverse; + self.pos += skip - need_reverse; - // We need to "rewind" by `reverse` chars, which should be in one or - // more Text nodes beforehand. Typically the chars will *all* be in a - // single Text node, containing whatever text came before the ":" that + // We need to "rewind" by `need_reverse` chars, which should be in one + // or more Text nodes beforehand. Typically the chars will *all* be in + // a single Text node, containing whatever text came before the ":" that // triggered this method, eg. "See our website at http" ("://blah.com"). // // relaxed_autolinks allows some slightly pathological cases. First, @@ -1323,6 +1324,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { // a scheme including the letter "w", which will split Text inlines due // to them being their own trigger (for handle_autolink_w), meaning // "wa://…" will need to traverse two Texts to complete the rewind. + let mut reverse = need_reverse; while reverse > 0 { match node.last_child().unwrap().data.borrow_mut().value { NodeValue::Text(ref mut prev) => { @@ -1338,6 +1340,24 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } + { + let sp = &mut post.data.borrow_mut().sourcepos; + // See [`make_inline`]. 
+ sp.start = ( + self.line, + (startpos as isize - need_reverse as isize + + 1 + + self.column_offset + + self.line_offset as isize) as usize, + ) + .into(); + sp.end = ( + self.line, + (self.pos as isize + self.column_offset + self.line_offset as isize) as usize, + ) + .into(); + } + Some(post) } diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index 4811f4f2..896df75f 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -305,12 +305,16 @@ const LINK: TestCase = ( sourcepos!((3:7-3:11)), sourcepos!((4:7-4:16)), sourcepos!((5:7-5:29)), + sourcepos!((6:7-6:21)), + sourcepos!((7:7-7:21)), ], r#"hello world hello [foo](https://example.com) world hello [foo] world hello [bar][bar] world hello https://example.com/foo world +hello www.example.com world +hello foo@example.com world [foo]: https://example.com [bar]: https://example.com"#, From 19bcb5eea694da49554f07a0caf10c003a3cc031 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 08/30] inlines: more autolink fixes (still no email). Need to set the inner text sourcepos, as well as propagate any rewind adjustments to previous nodes' sourcepos. --- src/parser/autolink.rs | 3 ++- src/parser/inlines.rs | 8 +++++++- src/tests/autolink.rs | 25 +++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 456f9293..f86f6c83 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -7,7 +7,8 @@ use typed_arena::Arena; use unicode_categories::UnicodeCategories; // TODO: this can probably be cleaned up a lot. It used to handle all three of -// {url,www,email}_match, but now just the last of those. +// {url,www,email}_match, but now just the last of those. (This is still per +// upstream cmark-gfm, so it's not easily changed without breaking compat.) 
pub(crate) fn process_autolinks<'a>( arena: &'a Arena>, node: &'a AstNode<'a>, diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index e3dc237b..116ccf05 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -1326,10 +1326,12 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { // "wa://…" will need to traverse two Texts to complete the rewind. let mut reverse = need_reverse; while reverse > 0 { - match node.last_child().unwrap().data.borrow_mut().value { + let mut last_child = node.last_child().unwrap().data.borrow_mut(); + match last_child.value { NodeValue::Text(ref mut prev) => { if reverse < prev.len() { prev.truncate(prev.len() - reverse); + last_child.sourcepos.end.column -= reverse; reverse = 0; } else { reverse -= prev.len(); @@ -1356,6 +1358,10 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { (self.pos as isize + self.column_offset + self.line_offset as isize) as usize, ) .into(); + + // Inner text node gets the same sp, since there are no surrounding + // characters for autolinks of these kind. + post.first_child().unwrap().data.borrow_mut().sourcepos = *sp; } Some(post) diff --git a/src/tests/autolink.rs b/src/tests/autolink.rs index c4363a14..1e7a89a2 100644 --- a/src/tests/autolink.rs +++ b/src/tests/autolink.rs @@ -395,3 +395,28 @@ fn autolink_fuzz_we() { no_roundtrip, ); } + +#[test] +fn autolink_sourcepos() { + assert_ast_match!( + [extension.autolink], + "a www.com\n" + "\n" + "b https://www.com\n" + , + (document (1:1-3:18) [ + (paragraph (1:1-1:10) [ + (text (1:1-1:3) "a ") + (link (1:4-1:10) [ + (text (1:4-1:10) "www.com") + ]) + ]) + (paragraph (3:1-3:18) [ + (text (3:1-3:3) "b ") + (link (3:4-3:18) [ + (text (3:4-3:18) "https://www.com") + ]) + ]) + ]) + ); +} From ffd8159db68b35021fd9ef6f8d314a976ee4d3b4 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 09/30] autolink: fix a bunch of sourcepos! 
--- src/parser/autolink.rs | 258 +++++++++++++++++++++++------------ src/parser/mod.rs | 1 + src/tests/autolink.rs | 35 ++++-- 3 files changed, 166 insertions(+), 128 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index f86f6c83..e9c8369b 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -1,6 +1,6 @@ use crate::character_set::character_set; use crate::ctype::{isalnum, isalpha, isspace}; -use crate::nodes::{AstNode, NodeLink, NodeValue}; +use crate::nodes::{AstNode, NodeLink, NodeValue, Sourcepos}; use crate::parser::inlines::make_inline; use std::str; use typed_arena::Arena; @@ -14,6 +14,7 @@ pub(crate) fn process_autolinks<'a>( arena: &'a Arena>, node: &'a AstNode<'a>, contents_str: &mut String, relaxed_autolinks: bool, + sourcepos: &mut Sourcepos, ) { let contents = contents_str.as_bytes(); let len = contents.len(); @@ -60,15 +61,154 @@ pub(crate) fn process_autolinks<'a>( post.insert_after(make_inline( arena, NodeValue::Text(remain.to_string()), - (0, 1, 0, 1).into(), + ( + sourcepos.end.line, + sourcepos.end.column + 1 - (len - i - skip), + sourcepos.end.line, + sourcepos.end.column, + ) + .into(), )); } + + // Enormous HACK. This compensates for our lack of accounting for + // smart punctuation in sourcepos in exactly one extant regression + // test (tests::fuzz::trailing_hyphen_matches), but it is no + // substitute, probably fails miserably in other contexts, and we + // really instead just need an actual solution to that issue. + sourcepos.end.column -= str::from_utf8(&contents[i..]).unwrap().chars().count(); + contents_str.truncate(i); + let nsp: Sourcepos = ( + sourcepos.end.line, + sourcepos.end.column + 1, + sourcepos.end.line, + sourcepos.end.column + skip, + ) + .into(); + post.data.borrow_mut().sourcepos = nsp; + // Inner text gets same sourcepos as link, since there's nothing but + // the text.
+ post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; return; } } } +fn email_match<'a>( + arena: &'a Arena>, + contents: &[u8], + i: usize, + relaxed_autolinks: bool, +) -> Option<(&'a AstNode<'a>, usize, usize)> { + const EMAIL_OK_SET: [bool; 256] = character_set!(b".+-_"); + + let size = contents.len(); + + let mut auto_mailto = true; + let mut is_xmpp = false; + let mut rewind = 0; + + while rewind < i { + let c = contents[i - rewind - 1]; + + if isalnum(c) || EMAIL_OK_SET[c as usize] { + rewind += 1; + continue; + } + + if c == b':' { + if validate_protocol("mailto", contents, i - rewind - 1) { + auto_mailto = false; + rewind += 1; + continue; + } + + if validate_protocol("xmpp", contents, i - rewind - 1) { + is_xmpp = true; + auto_mailto = false; + rewind += 1; + continue; + } + } + + break; + } + + if rewind == 0 { + return None; + } + + let mut link_end = 1; + let mut np = 0; + + while link_end < size - i { + let c = contents[i + link_end]; + + if isalnum(c) { + // empty + } else if c == b'@' { + return None; + } else if c == b'.' 
&& link_end < size - i - 1 && isalnum(contents[i + link_end + 1]) { + np += 1; + } else if c == b'/' && is_xmpp { + // xmpp allows a `/` in the url + } else if c != b'-' && c != b'_' { + break; + } + + link_end += 1; + } + + if link_end < 2 + || np == 0 + || (!isalpha(contents[i + link_end - 1]) && contents[i + link_end - 1] != b'.') + { + return None; + } + + link_end = autolink_delim(&contents[i..], link_end, relaxed_autolinks); + if link_end == 0 { + return None; + } + + let mut url = if auto_mailto { + "mailto:".to_string() + } else { + "".to_string() + }; + let text = str::from_utf8(&contents[i - rewind..link_end + i]).unwrap(); + url.push_str(text); + + let inl = make_inline( + arena, + NodeValue::Link(NodeLink { + url, + title: String::new(), + }), + (0, 1, 0, 1).into(), + ); + + inl.append(make_inline( + arena, + NodeValue::Text(text.to_string()), + (0, 1, 0, 1).into(), + )); + Some((inl, rewind, rewind + link_end)) +} + +fn validate_protocol(protocol: &str, contents: &[u8], cursor: usize) -> bool { + let size = contents.len(); + let mut rewind = 0; + + while rewind < cursor && isalpha(contents[cursor - rewind - 1]) { + rewind += 1; + } + + size - cursor + rewind >= protocol.len() + && &contents[cursor - rewind..cursor] == protocol.as_bytes() +} + pub fn www_match<'a>( arena: &'a Arena>, contents: &[u8], @@ -293,117 +433,3 @@ pub fn url_match<'a>( )); Some((inl, rewind, rewind + link_end)) } - -fn email_match<'a>( - arena: &'a Arena>, - contents: &[u8], - i: usize, - relaxed_autolinks: bool, -) -> Option<(&'a AstNode<'a>, usize, usize)> { - const EMAIL_OK_SET: [bool; 256] = character_set!(b".+-_"); - - let size = contents.len(); - - let mut auto_mailto = true; - let mut is_xmpp = false; - let mut rewind = 0; - - while rewind < i { - let c = contents[i - rewind - 1]; - - if isalnum(c) || EMAIL_OK_SET[c as usize] { - rewind += 1; - continue; - } - - if c == b':' { - if validate_protocol("mailto", contents, i - rewind - 1) { - auto_mailto = false; - rewind += 
1; - continue; - } - - if validate_protocol("xmpp", contents, i - rewind - 1) { - is_xmpp = true; - auto_mailto = false; - rewind += 1; - continue; - } - } - - break; - } - - if rewind == 0 { - return None; - } - - let mut link_end = 1; - let mut np = 0; - - while link_end < size - i { - let c = contents[i + link_end]; - - if isalnum(c) { - // empty - } else if c == b'@' { - return None; - } else if c == b'.' && link_end < size - i - 1 && isalnum(contents[i + link_end + 1]) { - np += 1; - } else if c == b'/' && is_xmpp { - // xmpp allows a `/` in the url - } else if c != b'-' && c != b'_' { - break; - } - - link_end += 1; - } - - if link_end < 2 - || np == 0 - || (!isalpha(contents[i + link_end - 1]) && contents[i + link_end - 1] != b'.') - { - return None; - } - - link_end = autolink_delim(&contents[i..], link_end, relaxed_autolinks); - if link_end == 0 { - return None; - } - - let mut url = if auto_mailto { - "mailto:".to_string() - } else { - "".to_string() - }; - let text = str::from_utf8(&contents[i - rewind..link_end + i]).unwrap(); - url.push_str(text); - - let inl = make_inline( - arena, - NodeValue::Link(NodeLink { - url, - title: String::new(), - }), - (0, 1, 0, 1).into(), - ); - - inl.append(make_inline( - arena, - NodeValue::Text(text.to_string()), - (0, 1, 0, 1).into(), - )); - Some((inl, rewind, rewind + link_end)) -} - -fn validate_protocol(protocol: &str, contents: &[u8], cursor: usize) -> bool { - let size = contents.len(); - let mut rewind = 0; - - while rewind < cursor && isalpha(contents[cursor - rewind - 1]) { - rewind += 1; - } - - size - cursor + rewind >= protocol.len() - && &contents[cursor - rewind..cursor] == protocol.as_bytes() -} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index f2bb7ace..b27654ed 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3014,6 +3014,7 @@ where node, text, self.options.parse.relaxed_autolinks, + sourcepos, ); } } diff --git a/src/tests/autolink.rs b/src/tests/autolink.rs index 1e7a89a2..36f524b8 
100644 --- a/src/tests/autolink.rs +++ b/src/tests/autolink.rs @@ -259,9 +259,9 @@ fn sourcepos_correctly_restores_context() { // wasn't equal. That probably wouldn't happen, though -- i.e. we're // never autolinking into the middle of a rendered smart punctuation. // - // For now the desired sourcepos is documented in comment. What we - // have currently (after backing out the adjustments, having hit the - // above case) matches cmark-gfm. + // The below documents test cases that used to break; I suspect smart + // punctuation is still a breaking case however! + assert_ast_match!( [], "ab _cde_ f@g.ee h*ijklm* n", @@ -289,11 +289,11 @@ fn sourcepos_correctly_restores_context() { (emph (1:4-1:8) [ (text (1:5-1:7) "cde") ]) - (text (1:9-1:17) " ") // (text (1:9-1:9) " ") - (link (XXX) [ // (link (1:10-1:15) [ - (text (XXX) "f@g.ee") // (text (1:10-1:15) "f@g.ee") + (text (1:9-1:9) " ") + (link (1:10-1:15) [ + (text (1:10-1:15) "f@g.ee") ]) - (text (XXX) " h") // (text (1:16-1:17) " h") + (text (1:16-1:17) " h") (emph (1:18-1:24) [ (text (1:19-1:23) "ijklm") ]) @@ -400,22 +400,33 @@ fn autolink_fuzz_we() { fn autolink_sourcepos() { assert_ast_match!( [extension.autolink], - "a www.com\n" + "a www.com x\n" + "\n" + "b https://www.com y\n" "\n" - "b https://www.com\n" + "c foo@www.com z\n" , - (document (1:1-3:18) [ - (paragraph (1:1-1:10) [ + (document (1:1-5:17) [ + (paragraph (1:1-1:13) [ (text (1:1-1:3) "a ") (link (1:4-1:10) [ (text (1:4-1:10) "www.com") ]) + (text (1:11-1:13) " x") ]) - (paragraph (3:1-3:18) [ + (paragraph (3:1-3:21) [ (text (3:1-3:3) "b ") (link (3:4-3:18) [ (text (3:4-3:18) "https://www.com") ]) + (text (3:19-3:21) " y") + ]) + (paragraph (5:1-5:17) [ + (text (5:1-5:3) "c ") + (link (5:4-5:14) [ + (text (5:4-5:14) "foo@www.com") + ]) + (text (5:15-5:17) " z") ]) ]) ); From 220a902b673a345646f03c97b69d7b1b5e839772 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 10/30] parser: fix ThematicBreak 
sourcepos end. --- src/parser/mod.rs | 6 ++++++ src/tests/sourcepos.rs | 5 ++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index b27654ed..1b724866 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1851,6 +1851,7 @@ where *container = self.add_child(container, NodeValue::ThematicBreak, self.first_nonspace + 1); let adv = line.len() - 1 - self.offset; + container.data.borrow_mut().sourcepos.end = (self.line_number, adv).into(); self.advance_offset(line, adv, false); true @@ -2708,6 +2709,11 @@ where _ => false, } { ast.sourcepos.end = (self.line_number, self.curline_end_col).into(); + } else if match ast.value { + NodeValue::ThematicBreak => true, + _ => false, + } { + // sourcepos.end set during opening. } else { ast.sourcepos.end = (self.line_number - 1, self.last_line_length).into(); } diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index 896df75f..76fc0628 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -161,10 +161,10 @@ hello world ); const THEMATIC_BREAK: TestCase = ( - &[sourcepos!((3:1-3:3))], + &[sourcepos!((3:2-3:4))], r#"Hello ---- + --- World"#, ); @@ -425,7 +425,6 @@ fn node_values() -> HashMap { | DescriptionItem // end is 4:0 | DescriptionTerm // end is 3:0 | DescriptionDetails // end is 4:0 - | ThematicBreak // end is 4:0 | Math // is 3:2-3:6 but should be 3:1-3:7 | Raw // unparseable ) From b92a2ebb55751df11318ab3aa0116bfd2e204411 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Wed, 26 Feb 2025 13:44:24 +1100 Subject: [PATCH 11/30] tests: document list/item sourcepos issues. This one will be a bit of a pickle, and entails differing from cmark-gfm (which currently gives these "-4:0" sourcepos ends for the list and last item on the same test). 
--- src/tests/regressions.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/tests/regressions.rs b/src/tests/regressions.rs index a2456aac..8d54a217 100644 --- a/src/tests/regressions.rs +++ b/src/tests/regressions.rs @@ -146,3 +146,34 @@ fn sourcepos_lone_backtick() { ]) ); } + +#[ignore] // This one will require a bit of thinking. +#[test] +fn sourcepos_link_items() { + assert_ast_match!( + [], + "- ab\n" + "- cdef\n" + "\n" + "\n" + "g\n" + , + (document (1:1-5:1) [ + (list (1:1-2:6) [ + (item (1:1-1:4) [ + (paragraph (1:3-1:4) [ + (text (1:3-1:4) "ab") + ]) + ]) + (item (2:1-2:6) [ + (paragraph (2:3-2:6) [ + (text (2:3-2:6) "cdef") + ]) + ]) + ]) + (paragraph (5:1-5:1) [ + (text (5:1-5:1) "g") + ]) + ]) + ); +} From 30abc1c3b66879326d752457cfa5a5b3ee751755 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 13:06:21 +1100 Subject: [PATCH 12/30] inlines: fix Math sourcepos. Test some more content literals, too. --- src/parser/inlines.rs | 159 ++++++++++++++++++----------------------- src/tests.rs | 18 +++-- src/tests/math.rs | 11 ++- src/tests/sourcepos.rs | 1 - 4 files changed, 86 insertions(+), 103 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 116ccf05..b465100d 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -721,8 +721,8 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn scan_to_closing_dollar(&mut self, opendollarlength: usize) -> Option { - if !(self.options.extension.math_dollars) || opendollarlength > MAX_MATH_DOLLARS { + fn scan_to_closing_dollar(&mut self, opendollarlength: usize) -> Option { + if !self.options.extension.math_dollars || opendollarlength > MAX_MATH_DOLLARS { return None; } @@ -740,17 +740,15 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { return None; } + let c = self.input[self.pos - 1]; + // space not allowed before ending $ - if opendollarlength == 1 { - let c = self.input[self.pos - 
1]; - if isspace(c) { - return None; - } + if opendollarlength == 1 && isspace(c) { + return None; } // dollar signs must also be backslash-escaped if they occur within math - let c = self.input[self.pos - 1]; - if opendollarlength == 1 && c == (b'\\') { + if opendollarlength == 1 && c == b'\\' { self.pos += 1; continue; } @@ -768,10 +766,8 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn scan_to_closing_code_dollar(&mut self) -> Option { - if !self.options.extension.math_code { - return None; - } + fn scan_to_closing_code_dollar(&mut self) -> Option { + assert!(self.options.extension.math_code); loop { while self.peek_char().map_or(false, |&c| c != b'$') { @@ -783,96 +779,77 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } let c = self.input[self.pos - 1]; + self.pos += 1; if c == b'`' { - self.pos += 1; return Some(self.pos); - } else { - self.pos += 1; } } } // Heuristics used from https://pandoc.org/MANUAL.html#extension-tex_math_dollars - pub fn handle_dollars(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { - if self.options.extension.math_dollars || self.options.extension.math_code { - let opendollars = self.take_while(b'$'); - let mut code_math = false; - - // check for code math - if opendollars == 1 - && self.options.extension.math_code - && self.peek_char().map_or(false, |&c| c == b'`') - { - code_math = true; - self.pos += 1; - } + fn handle_dollars(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { + if !(self.options.extension.math_dollars || self.options.extension.math_code) { + self.pos += 1; + return self.make_inline(NodeValue::Text("$".to_string()), self.pos - 1, self.pos - 1); + } + let startpos = self.pos; + let opendollars = self.take_while(b'$'); + let mut code_math = false; - let startpos = self.pos; - let endpos: Option = if code_math { - self.scan_to_closing_code_dollar() - } else { - self.scan_to_closing_dollar(opendollars) - }; + // check for code math + if 
opendollars == 1 + && self.options.extension.math_code + && self.peek_char().map_or(false, |&c| c == b'`') + { + code_math = true; + self.pos += 1; + } + let fence_length = if code_math { 2 } else { opendollars }; - let fence_length = if code_math { 2 } else { opendollars }; - let endpos: Option = match endpos { - Some(epos) => { - if epos - startpos + fence_length < fence_length * 2 + 1 { - None - } else { - endpos - } - } - None => endpos, - }; + let endpos: Option = if code_math { + self.scan_to_closing_code_dollar() + } else { + self.scan_to_closing_dollar(opendollars) + } + .filter(|epos| epos - startpos >= fence_length * 2 + 1); - match endpos { - None => { - if code_math { - self.pos = startpos - 1; - self.make_inline( - NodeValue::Text("$".to_string()), - self.pos - 1, - self.pos - 1, - ) - } else { - self.pos = startpos; - self.make_inline( - NodeValue::Text("$".repeat(opendollars)), - self.pos, - self.pos, - ) - } - } - Some(endpos) => { - let buf = &self.input[startpos..endpos - fence_length]; - let buf: Vec = if code_math || opendollars == 1 { - strings::normalize_code(buf) - } else { - buf.to_vec() - }; - let math = NodeMath { - dollar_math: !code_math, - display_math: opendollars == 2, - literal: String::from_utf8(buf).unwrap(), - }; - let node = self.make_inline( - NodeValue::Math(math), - startpos, - endpos - fence_length - 1, - ); - self.adjust_node_newlines( - node, - endpos - startpos, - fence_length, - parent_line_offsets, - ); - node + eprintln!("** startpos is {:?}, endpos is {:?}", startpos, endpos); + + match endpos { + None => { + if code_math { + self.pos = startpos + 1; + self.make_inline(NodeValue::Text("$".to_string()), self.pos - 1, self.pos - 1) + } else { + self.pos = startpos + fence_length; + self.make_inline( + NodeValue::Text("$".repeat(opendollars)), + self.pos - fence_length, + self.pos - 1, + ) } } - } else { - self.pos += 1; - self.make_inline(NodeValue::Text("$".to_string()), self.pos - 1, self.pos - 1) + Some(endpos) => { + 
let buf = &self.input[startpos + fence_length..endpos - fence_length]; + let buf: Vec = if code_math || opendollars == 1 { + strings::normalize_code(buf) + } else { + buf.to_vec() + }; + let math = NodeMath { + dollar_math: !code_math, + display_math: opendollars == 2, + literal: String::from_utf8(buf).unwrap(), + }; + let node = self.make_inline(NodeValue::Math(math), startpos, endpos - 1); + self.adjust_node_newlines( + node, + endpos - startpos - fence_length, + fence_length, + parent_line_offsets, + ); + node + } } } diff --git a/src/tests.rs b/src/tests.rs index ea66f974..0c4cd950 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -387,11 +387,19 @@ impl AstMatchTree { node.children().count(), "text node should have no children" ); - assert_eq!( - text, - ast.value.text().unwrap(), - "text node content should match" - ); + match ast.value { + NodeValue::Math(ref nm) => { + assert_eq!(text, &nm.literal, "math node content should match"); + } + NodeValue::CodeBlock(ref ncb) => { + assert_eq!(text, &ncb.literal, "code block node content should match") + } + _ => assert_eq!( + text, + ast.value.text().unwrap(), + "text node content should match" + ), + } } AstMatchContent::Children(children) => { assert_eq!( diff --git a/src/tests/math.rs b/src/tests/math.rs index 5920c50d..b0a0b54d 100644 --- a/src/tests/math.rs +++ b/src/tests/math.rs @@ -134,7 +134,6 @@ fn math_unrecognized_syntax_non_roundtrip(markdown: &str, html: &str) { } #[test] -#[ignore] fn sourcepos() { assert_ast_match!( [extension.math_dollars, extension.math_code], @@ -149,16 +148,16 @@ fn sourcepos() { "```\n", (document (1:1-9:3) [ (paragraph (1:1-1:29) [ - (math (1:1-1:5)) + (math (1:1-1:5) "x^2") (text (1:6-1:10) " and ") - (math (1:11-1:17)) + (math (1:11-1:17) "y^2") (text (1:18-1:22) " and ") - (math (1:23-1:29)) + (math (1:23-1:29) "z^2") ]) (paragraph (3:1-5:2) [ - (math (3:1-5:2)) + (math (3:1-5:2) "\na^2\n") ]) - (code_block (7:1-9:3)) + (code_block (7:1-9:3) "b^2\n") ]) ); } diff --git 
a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index 76fc0628..19aeb619 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -425,7 +425,6 @@ fn node_values() -> HashMap { | DescriptionItem // end is 4:0 | DescriptionTerm // end is 3:0 | DescriptionDetails // end is 4:0 - | Math // is 3:2-3:6 but should be 3:1-3:7 | Raw // unparseable ) }) From 31b07ffcf1c9b8477c2260d6f1847716cdcdcc83 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 13:06:21 +1100 Subject: [PATCH 13/30] inlines: mark some funs as not pub. --- src/parser/inlines.rs | 54 +++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index b465100d..25df673c 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -566,7 +566,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } #[inline] - pub fn eof(&self) -> bool { + fn eof(&self) -> bool { self.pos >= self.input.len() } @@ -586,7 +586,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn find_special_char(&self) -> usize { + fn find_special_char(&self) -> usize { for n in self.pos..self.input.len() { if self.special_chars[self.input[n] as usize] { if self.input[n] == b'^' && self.within_brackets { @@ -624,7 +624,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn handle_newline(&mut self) -> &'a AstNode<'a> { + fn handle_newline(&mut self) -> &'a AstNode<'a> { let nlpos = self.pos; if self.input[self.pos] == b'\r' { self.pos += 1; @@ -643,7 +643,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { inl } - pub fn take_while(&mut self, c: u8) -> usize { + fn take_while(&mut self, c: u8) -> usize { let start_pos = self.pos; while self.peek_char() == Some(&c) { self.pos += 1; @@ -651,7 +651,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.pos - start_pos } - pub fn take_while_with_limit(&mut self, c: u8, 
limit: usize) -> usize { + fn take_while_with_limit(&mut self, c: u8, limit: usize) -> usize { let start_pos = self.pos; let mut count = 0; while count < limit && self.peek_char() == Some(&c) { @@ -661,7 +661,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.pos - start_pos } - pub fn scan_to_closing_backtick(&mut self, openticklength: usize) -> Option { + fn scan_to_closing_backtick(&mut self, openticklength: usize) -> Option { if openticklength > MAXBACKTICKS { return None; } @@ -688,7 +688,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn handle_backticks(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { + fn handle_backticks(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { let startpos = self.pos; let openticks = self.take_while(b'`'); let endpos = self.scan_to_closing_backtick(openticks); @@ -813,8 +813,6 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } .filter(|epos| epos - startpos >= fence_length * 2 + 1); - eprintln!("** startpos is {:?}, endpos is {:?}", startpos, endpos); - match endpos { None => { if code_math { @@ -862,7 +860,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { skipped } - pub fn handle_delim(&mut self, c: u8) -> &'a AstNode<'a> { + fn handle_delim(&mut self, c: u8) -> &'a AstNode<'a> { let (numdelims, can_open, can_close) = self.scan_delims(c); let contents = if c == b'\'' && self.options.parse.smart { @@ -891,7 +889,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { inl } - pub fn handle_hyphen(&mut self) -> &'a AstNode<'a> { + fn handle_hyphen(&mut self) -> &'a AstNode<'a> { let start = self.pos; self.pos += 1; @@ -924,7 +922,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.make_inline(NodeValue::Text(buf), start, self.pos - 1) } - pub fn handle_period(&mut self) -> &'a AstNode<'a> { + fn handle_period(&mut self) -> &'a AstNode<'a> { self.pos += 1; if self.options.parse.smart && 
self.peek_char().map_or(false, |&c| c == b'.') { self.pos += 1; @@ -943,7 +941,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn scan_delims(&mut self, c: u8) -> (usize, bool, bool) { + fn scan_delims(&mut self, c: u8) -> (usize, bool, bool) { let before_char = if self.pos == 0 { '\n' } else { @@ -1037,7 +1035,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn push_delimiter(&mut self, c: u8, can_open: bool, can_close: bool, inl: &'a AstNode<'a>) { + fn push_delimiter(&mut self, c: u8, can_open: bool, can_close: bool, inl: &'a AstNode<'a>) { let d = self.delimiter_arena.alloc(Delimiter { prev: Cell::new(self.last_delimiter), next: Cell::new(None), @@ -1059,7 +1057,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { // // As a side-effect, handle long "***" and "___" nodes by truncating them in // place to be re-matched by `process_emphasis`. - pub fn insert_emph( + fn insert_emph( &mut self, opener: &'d Delimiter<'a, 'd>, closer: &'d Delimiter<'a, 'd>, @@ -1188,7 +1186,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn handle_backslash(&mut self) -> &'a AstNode<'a> { + fn handle_backslash(&mut self) -> &'a AstNode<'a> { let startpos = self.pos; self.pos += 1; @@ -1231,7 +1229,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.pos > old_pos || self.eof() } - pub fn handle_entity(&mut self) -> &'a AstNode<'a> { + fn handle_entity(&mut self) -> &'a AstNode<'a> { self.pos += 1; match entity::unescape(&self.input[self.pos..]) { @@ -1248,7 +1246,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } #[cfg(feature = "shortcodes")] - pub fn handle_shortcodes_colon(&mut self) -> Option<&'a AstNode<'a>> { + fn handle_shortcodes_colon(&mut self) -> Option<&'a AstNode<'a>> { let matchlen = scanners::shortcode(&self.input[self.pos + 1..])?; let shortcode = unsafe { @@ -1265,11 +1263,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 
'd, 'i, 'c> { )) } - pub fn handle_autolink_with( - &mut self, - node: &'a AstNode<'a>, - f: F, - ) -> Option<&'a AstNode<'a>> + fn handle_autolink_with(&mut self, node: &'a AstNode<'a>, f: F) -> Option<&'a AstNode<'a>> where F: Fn( &'a Arena>, @@ -1344,15 +1338,15 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { Some(post) } - pub fn handle_autolink_colon(&mut self, node: &'a AstNode<'a>) -> Option<&'a AstNode<'a>> { + fn handle_autolink_colon(&mut self, node: &'a AstNode<'a>) -> Option<&'a AstNode<'a>> { self.handle_autolink_with(node, autolink::url_match) } - pub fn handle_autolink_w(&mut self, node: &'a AstNode<'a>) -> Option<&'a AstNode<'a>> { + fn handle_autolink_w(&mut self, node: &'a AstNode<'a>) -> Option<&'a AstNode<'a>> { self.handle_autolink_with(node, autolink::www_match) } - pub fn handle_pointy_brace(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { + fn handle_pointy_brace(&mut self, parent_line_offsets: &[usize]) -> &'a AstNode<'a> { self.pos += 1; if let Some(matchlen) = scanners::autolink_uri(&self.input[self.pos..]) { @@ -1453,7 +1447,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.make_inline(NodeValue::Text("<".to_string()), self.pos - 1, self.pos - 1) } - pub fn push_bracket(&mut self, image: bool, inl_text: &'a AstNode<'a>) { + fn push_bracket(&mut self, image: bool, inl_text: &'a AstNode<'a>) { let len = self.brackets.len(); if len > 0 { self.brackets[len - 1].bracket_after = true; @@ -1469,7 +1463,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } } - pub fn handle_close_bracket(&mut self) -> Option<&'a AstNode<'a>> { + fn handle_close_bracket(&mut self) -> Option<&'a AstNode<'a>> { self.pos += 1; let initial_pos = self.pos; @@ -1690,7 +1684,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { Some(self.make_inline(NodeValue::Text("]".to_string()), self.pos - 1, self.pos - 1)) } - pub fn close_bracket_match(&mut self, is_image: bool, url: String, 
title: String) { + fn close_bracket_match(&mut self, is_image: bool, url: String, title: String) { let brackets_len = self.brackets.len(); let nl = NodeLink { url, title }; @@ -1771,7 +1765,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { // Handles wikilink syntax // [[link text|url]] // [[url|link text]] - pub fn handle_wikilink(&mut self) -> Option<&'a AstNode<'a>> { + fn handle_wikilink(&mut self) -> Option<&'a AstNode<'a>> { let startpos = self.pos; let component = self.wikilink_url_link_label()?; let url_clean = strings::clean_url(component.url); From ee42217a7d9cf6ac989c542331936e8d18cc10f5 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 14:03:20 +1100 Subject: [PATCH 14/30] tests: elaborate on a bunch of AST tests. Also test text content of links, images, frontmatter, and support testing both text and children (since many nodes have both). --- src/parser/autolink.rs | 27 +++++++---- src/tests.rs | 94 ++++++++++++++++++++++++++------------- src/tests/autolink.rs | 8 ++-- src/tests/core.rs | 16 +++---- src/tests/front_matter.rs | 78 +++++++++++++++----------------- src/tests/fuzz.rs | 29 +++++++----- 6 files changed, 149 insertions(+), 103 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index e9c8369b..b9bdd346 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -55,7 +55,23 @@ pub(crate) fn process_autolinks<'a>( if let Some((post, reverse, skip)) = post_org { i -= reverse; node.insert_after(post); - if i + skip < len { + + // Enormous HACK. This compensates for our lack of accounting for + // smart puncutation in sourcepos in exactly one extant regression + // test (tests::fuzz::trailing_hyphen_matches) and one fuzzer + // crash observed so far, but it is no substitute, probably fails + // miserably in other contexts, and we really instead just need an + // actual solution to that issue. 
+ // + // Problems as follows: + // * We almost certainly need to measure `skip` the same way. + // * We're counting extremely arbitrary Unicode characters, which + // don't _necessarily_ map 1:1 with the transformation done by + // smart punctuation. + // * I've briefly forgotten _how_ this actually fixes anything. + let len_less_i = str::from_utf8(&contents[i..]).unwrap().chars().count(); + + if skip < len_less_i { let remain = str::from_utf8(&contents[i + skip..]).unwrap(); assert!(!remain.is_empty()); post.insert_after(make_inline( @@ -63,7 +79,7 @@ pub(crate) fn process_autolinks<'a>( NodeValue::Text(remain.to_string()), ( sourcepos.end.line, - sourcepos.end.column + 1 - (len - i - skip), + sourcepos.end.column + 1 - (len_less_i - skip), sourcepos.end.line, sourcepos.end.column, ) @@ -71,12 +87,7 @@ pub(crate) fn process_autolinks<'a>( )); } - // Enormous HACK. This compensates for our lack of accounting for - // smart puncutation in sourcepos in exactly one extant regression - // test (tests::fuzz::trailing_hyphen_matches), but it is no - // substitute, probably fails miserably in other contexts, and we - // really instead just need an actual solution to that issue. - sourcepos.end.column -= str::from_utf8(&contents[i..]).unwrap().chars().count(); + sourcepos.end.column -= len_less_i; contents_str.truncate(i); let nsp: Sourcepos = ( diff --git a/src/tests.rs b/src/tests.rs index 0c4cd950..2822e80d 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -289,17 +289,16 @@ macro_rules! sourcepos { pub(crate) use sourcepos; macro_rules! ast { - (($name:tt $sp:tt)) => { - ast!(($name $sp [])) - }; - (($name:tt $sp:tt $content:tt)) => { + (($name:tt $sp:tt $( $content:tt )*)) => { AstMatchTree { name: stringify!($name).to_string(), sourcepos: sourcepos!($sp), - content: ast!($content), + matches: vec![ $( ast_content!($content), )* ], } }; +} +macro_rules! 
ast_content { ($text:literal) => {AstMatchContent::Text($text.to_string())}; ([ $( $children:tt )* ]) => { AstMatchContent::Children(vec![ $( ast!($children), )* ]) @@ -307,6 +306,7 @@ macro_rules! ast { } pub(crate) use ast; +pub(crate) use ast_content; #[track_caller] fn assert_ast_match_i(md: &str, amt: AstMatchTree, opts: F) @@ -365,7 +365,7 @@ pub(crate) use assert_ast_match; struct AstMatchTree { name: String, sourcepos: Sourcepos, - content: AstMatchContent, + matches: Vec, } enum AstMatchContent { @@ -380,37 +380,71 @@ impl AstMatchTree { assert_eq!(self.name, ast.value.xml_node_name(), "node type matches"); assert_eq!(self.sourcepos, ast.sourcepos, "sourcepos are equal"); - match &self.content { - AstMatchContent::Text(text) => { - assert_eq!( - 0, - node.children().count(), - "text node should have no children" - ); - match ast.value { + let mut asserted_text = false; + let mut asserted_children = false; + + for m in &self.matches { + match m { + AstMatchContent::Text(text) => match ast.value { NodeValue::Math(ref nm) => { - assert_eq!(text, &nm.literal, "math node content should match"); + assert_eq!(text, &nm.literal, "Math literal should match"); + asserted_text = true; } NodeValue::CodeBlock(ref ncb) => { - assert_eq!(text, &ncb.literal, "code block node content should match") + assert_eq!(text, &ncb.literal, "CodeBlock literal should match"); + asserted_text = true; + } + NodeValue::Text(ref nt) => { + assert_eq!(text, nt, "Text content should match"); + asserted_text = true; + } + NodeValue::Link(ref nl) => { + assert_eq!(text, &nl.url, "Link destination should match"); + asserted_text = true; + } + NodeValue::Image(ref ni) => { + assert_eq!(text, &ni.url, "Image source should match"); + asserted_text = true; + } + NodeValue::FrontMatter(ref nfm) => { + assert_eq!(text, nfm, "Front matter content should match"); + asserted_text = true; } - _ => assert_eq!( - text, - ast.value.text().unwrap(), - "text node content should match" + _ => panic!( + "no 
text content matcher for this node type: {:?}", + ast.value ), - } - } - AstMatchContent::Children(children) => { - assert_eq!( - children.len(), - node.children().count(), - "children count should match" - ); - for (e, a) in children.iter().zip(node.children()) { - e.assert_match(a); + }, + AstMatchContent::Children(children) => { + assert_eq!( + children.len(), + node.children().count(), + "children count should match" + ); + for (e, a) in children.iter().zip(node.children()) { + e.assert_match(a); + } + asserted_children = true; } } } + + assert!( + asserted_children || node.children().count() == 0, + "children were not asserted" + ); + assert!( + asserted_text + || !matches!( + ast.value, + NodeValue::Math(_) + | NodeValue::CodeBlock(_) + | NodeValue::Text(_) + | NodeValue::Link(_) + | NodeValue::Image(_) + | NodeValue::FrontMatter(_) + ), + "text wasn't asserted" + ); } } diff --git a/src/tests/autolink.rs b/src/tests/autolink.rs index 36f524b8..9fb766cd 100644 --- a/src/tests/autolink.rs +++ b/src/tests/autolink.rs @@ -290,7 +290,7 @@ fn sourcepos_correctly_restores_context() { (text (1:5-1:7) "cde") ]) (text (1:9-1:9) " ") - (link (1:10-1:15) [ + (link (1:10-1:15) "mailto:f@g.ee" [ (text (1:10-1:15) "f@g.ee") ]) (text (1:16-1:17) " h") @@ -409,21 +409,21 @@ fn autolink_sourcepos() { (document (1:1-5:17) [ (paragraph (1:1-1:13) [ (text (1:1-1:3) "a ") - (link (1:4-1:10) [ + (link (1:4-1:10) "http://www.com" [ (text (1:4-1:10) "www.com") ]) (text (1:11-1:13) " x") ]) (paragraph (3:1-3:21) [ (text (3:1-3:3) "b ") - (link (3:4-3:18) [ + (link (3:4-3:18) "https://www.com" [ (text (3:4-3:18) "https://www.com") ]) (text (3:19-3:21) " y") ]) (paragraph (5:1-5:17) [ (text (5:1-5:3) "c ") - (link (5:4-5:14) [ + (link (5:4-5:14) "mailto:foo@www.com" [ (text (5:4-5:14) "foo@www.com") ]) (text (5:15-5:17) " z") diff --git a/src/tests/core.rs b/src/tests/core.rs index 714ced6b..d8df80f9 100644 --- a/src/tests/core.rs +++ b/src/tests/core.rs @@ -523,7 +523,7 @@ fn 
link_sourcepos_baseline() { "[ABCD](/)\n", (document (1:1-1:9) [ (paragraph (1:1-1:9) [ - (link (1:1-1:9) [ + (link (1:1-1:9) "/" [ (text (1:2-1:5) "ABCD") ]) ]) @@ -539,7 +539,7 @@ fn link_sourcepos_newline() { "[AB\nCD](/)\n", (document (1:1-2:6) [ (paragraph (1:1-2:6) [ - (link (1:1-2:6) [ + (link (1:1-2:6) "/" [ (text (1:2-1:3) "AB") (softbreak (1:4-1:4)) (text (2:1-2:2) "CD") @@ -560,8 +560,8 @@ fn link_sourcepos_truffle() { (paragraph (1:3-2:18) [ (text (1:3-1:3) "A") (softbreak (1:4-1:4)) - (link (2:1-2:18) [ - (image (2:2-2:13) [ + (link (2:1-2:18) "/B" [ + (image (2:2-2:13) "/B.png" [ (text (2:4-2:4) "B") ]) ]) @@ -583,8 +583,8 @@ fn link_sourcepos_truffle_twist() { (paragraph (1:3-2:20) [ (text (1:3-1:3) "A") (softbreak (1:4-1:4)) - (link (2:3-2:20) [ - (image (2:4-2:15) [ + (link (2:3-2:20) "/B" [ + (image (2:4-2:15) "/B.png" [ (text (2:6-2:6) "B") ]) ]) @@ -606,8 +606,8 @@ fn link_sourcepos_truffle_bergamot() { (paragraph (1:3-2:21) [ (text (1:3-1:3) "A") (softbreak (1:4-1:4)) - (link (2:4-2:21) [ - (image (2:5-2:16) [ + (link (2:4-2:21) "/B" [ + (image (2:5-2:16) "/B.png" [ (text (2:7-2:7) "B") ]) ]) diff --git a/src/tests/front_matter.rs b/src/tests/front_matter.rs index 60580eb5..548f7ab2 100644 --- a/src/tests/front_matter.rs +++ b/src/tests/front_matter.rs @@ -26,72 +26,64 @@ fn round_trip_wide_delimiter() { assert_eq!(&String::from_utf8(buf).unwrap(), input); } -#[test] -fn ast_wide_delimiter() { - let input = "\u{04fc}\nlayout: post\n\u{04fc}\nText\n"; - - assert_ast_match_i( - input, - ast!((document (1:1-4:4) [ - (frontmatter (1:1-3:2) []) - (paragraph (4:1-4:4) [ - (text (4:1-4:4) []) - ]) - ])), - |opts| opts.extension.front_matter_delimiter = Some("\u{04fc}".to_owned()), - ); -} - #[test] fn ast() { - let input = "q\nlayout: post\nq\nText\n"; - - assert_ast_match_i( - input, - ast!((document (1:1-4:4) [ - (frontmatter (1:1-3:1) []) + assert_ast_match!( + [extension.front_matter_delimiter = Some("q".to_owned())], + "q\nlayout: 
post\nq\nText\n", + (document (1:1-4:4) [ + (frontmatter (1:1-3:1) "q\nlayout: post\nq\n") (paragraph (4:1-4:4) [ - (text (4:1-4:4) []) + (text (4:1-4:4) "Text") ]) - ])), - |opts| opts.extension.front_matter_delimiter = Some("q".to_owned()), + ]) ); } #[test] fn ast_blank_line() { - let input = r#"--- + assert_ast_match!( + [extension.front_matter_delimiter = Some("---".to_owned())], + r#"--- a: b --- hello world -"#; - - assert_ast_match_i( - input, - ast!((document (1:1-5:11) [ - (frontmatter (1:1-3:3) []) +"#, + (document (1:1-5:11) [ + (frontmatter (1:1-3:3) "---\na: b\n---\n\n") (paragraph (5:1-5:11) [ - (text (5:1-5:11) []) + (text (5:1-5:11) "hello world") ]) - ])), - |opts| opts.extension.front_matter_delimiter = Some("---".to_owned()), + ]) ); } #[test] fn ast_carriage_return() { - let input = "q\r\nlayout: post\r\nq\r\nText\r\n"; + assert_ast_match!( + [extension.front_matter_delimiter = Some("q".to_owned())], + "q\r\nlayout: post\r\nq\r\nText\r\n", + (document (1:1-4:4) [ + (frontmatter (1:1-3:1) "q\r\nlayout: post\r\nq\r\n") + (paragraph (4:1-4:4) [ + (text (4:1-4:4) "Text") + ]) + ]) + ); +} - assert_ast_match_i( - input, - ast!((document (1:1-4:4) [ - (frontmatter (1:1-3:1) []) +#[test] +fn ast_wide_delimiter() { + assert_ast_match!( + [extension.front_matter_delimiter = Some("\u{04fc}".to_owned())], + "\u{04fc}\nlayout: post\n\u{04fc}\nText\n", + (document (1:1-4:4) [ + (frontmatter (1:1-3:2) "\u{04fc}\nlayout: post\n\u{04fc}\n") (paragraph (4:1-4:4) [ - (text (4:1-4:4) []) + (text (4:1-4:4) "Text") ]) - ])), - |opts| opts.extension.front_matter_delimiter = Some("q".to_owned()), + ]) ); } diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 5be56726..bc7a58a6 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -1,4 +1,4 @@ -use super::{html, html_opts}; +use super::*; #[test] fn pointy_brace_open() { @@ -72,21 +72,30 @@ fn bracket_match() { #[test] fn trailing_hyphen() { - html_opts!( - [extension.autolink, parse.smart, render.sourcepos], 
+ assert_ast_match!( + [extension.autolink, parse.smart], "3@.l-", - "

3@.l-

\n" + (document (1:1-1:5) [ + (paragraph (1:1-1:5) [ + (text (1:1-1:5) "3@.l-") + ]) + ]) ); } +#[ignore] #[test] fn trailing_hyphen_matches() { - html_opts!( - [extension.autolink, parse.smart, render.sourcepos], + assert_ast_match!( + [extension.autolink, parse.smart], "3@.l--", - "

3@.l

\n", - no_roundtrip // We serialise the link back to <3@.l>, which doesn't - // parse as a classic autolink, but the email inside the - // <...> does, meaning the get rendered! + (document (1:1-1:6) [ + (paragraph (1:1-1:6) [ + (link (1:1-1:4) [ + (text (1:1-1:4) "3@.l") + ]) + (text (1:5-1:6) "–") + ]) + ]) ); } From 33363de759e414d0ed87551145f15385155e03a7 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 14:43:45 +1100 Subject: [PATCH 15/30] autolink: fix up email sourcepos in presence of smart punctuation. --- src/parser/autolink.rs | 94 ++++++++++++++++++++++++++++-------------- src/parser/mod.rs | 47 ++++++++++----------- src/tests/autolink.rs | 23 ----------- src/tests/fuzz.rs | 58 +++++++++++++++++++++----- 4 files changed, 134 insertions(+), 88 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index b9bdd346..dd9ecabe 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -9,12 +9,13 @@ use unicode_categories::UnicodeCategories; // TODO: this can probably be cleaned up a lot. It used to handle all three of // {url,www,email}_match, but now just the last of those. (This is still per // upstream cmark-gfm, so it's not easily changed without breaking compat.) -pub(crate) fn process_autolinks<'a>( +pub(crate) fn process_email_autolinks<'a>( arena: &'a Arena>, node: &'a AstNode<'a>, contents_str: &mut String, relaxed_autolinks: bool, sourcepos: &mut Sourcepos, + spx: &[(Sourcepos, usize)], ) { let contents = contents_str.as_bytes(); let len = contents.len(); @@ -56,38 +57,56 @@ pub(crate) fn process_autolinks<'a>( i -= reverse; node.insert_after(post); - // Enormous HACK. 
This compensates for our lack of accounting for - // smart puncutation in sourcepos in exactly one extant regression - // test (tests::fuzz::trailing_hyphen_matches) and one fuzzer - // crash observed so far, but it is no substitute, probably fails - // miserably in other contexts, and we really instead just need an - // actual solution to that issue. - // - // Problems as follows: - // * We almost certainly need to measure `skip` the same way. - // * We're counting extremely arbitrary Unicode characters, which - // don't _necessarily_ map 1:1 with the transformation done by - // smart punctuation. - // * I've briefly forgotten _how_ this actually fixes anything. - let len_less_i = str::from_utf8(&contents[i..]).unwrap().chars().count(); - - if skip < len_less_i { + let remain = if i + skip < len { let remain = str::from_utf8(&contents[i + skip..]).unwrap(); assert!(!remain.is_empty()); - post.insert_after(make_inline( - arena, - NodeValue::Text(remain.to_string()), - ( - sourcepos.end.line, - sourcepos.end.column + 1 - (len_less_i - skip), - sourcepos.end.line, - sourcepos.end.column, - ) - .into(), - )); - } + Some(remain.to_string()) + } else { + None + }; + let initial_end_col = sourcepos.end.column; - sourcepos.end.column -= len_less_i; + // Sourcepos end column `e` of the original node (set by writing + // to `*sourcepos`) determined by advancing through `spx` until `i` + // bytes of input are seen. + // + // For each element `(sp, x)` in `spx`: + // - if remaining `i` is greater than the byte count `x`, + // set `i -= x` and continue. + // - if remaining `i` is equal to the byte count `x`, + // set `e = sp.end.column - 1` and finish. + // - if remaining `i` is less than the byte count `x`, + // assert `sp.end.column - sp.start.column + 1 == x` (1), + // set `e = sp.start.column + i - 1` and finish. + // + // (1) If `x` doesn't equal the range covered between the start + // and end column, there's no way to determine sourcepos within + // the range. 
This is a bug if it happens; it suggests we've + // matched an email autolink with some smart punctuation in it, + // or worse. + // + // NOTE: a little iffy on the way I've added `- 1` --- what we + // calculate here technically is the start column of the linked + // portion, then adjusted. I think this should be robust, but needs + // checking at edges. + + let mut rem_i = i; + for &(sp, x) in spx { + if rem_i > x { + rem_i -= x; + } else if rem_i == x { + sourcepos.end.column = sp.end.column - 1; + rem_i = 0; + break; + } else { + // rem_i < x + assert_eq!(sp.end.column - sp.start.column + 1, x); + sourcepos.end.column = sp.start.column + rem_i - 1; + rem_i = 0; + break; + } + } + assert!(rem_i == 0); contents_str.truncate(i); let nsp: Sourcepos = ( @@ -101,6 +120,21 @@ pub(crate) fn process_autolinks<'a>( // Inner text gets same sourcepos as link, since there's nothing but // the text. post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; + + if let Some(remain) = remain { + post.insert_after(make_inline( + arena, + NodeValue::Text(remain.to_string()), + ( + sourcepos.end.line, + nsp.end.column + 1, + sourcepos.end.line, + initial_end_col, + ) + .into(), + )); + } + return; } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 1b724866..743a88b3 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2955,38 +2955,33 @@ where let n_ast = &mut n.data.borrow_mut(); let mut sourcepos = n_ast.sourcepos; - loop { - match n_ast.value { - // Join adjacent text nodes together - NodeValue::Text(ref mut root) => { - let ns = match n.next_sibling() { - Some(ns) => ns, - _ => { - // Post-process once we are finished joining text nodes - self.postprocess_text_node(n, root, &mut sourcepos); - break; - } - }; - + match n_ast.value { + NodeValue::Text(ref mut root) => { + // Join adjacent text nodes together, then post-process. + // Record the original list of sourcepos and bytecounts + // for the post-processing step. 
+ let mut spx = vec![(sourcepos, root.len())]; + while let Some(ns) = n.next_sibling() { match ns.data.borrow().value { NodeValue::Text(ref adj) => { root.push_str(adj); - sourcepos.end.column = ns.data.borrow().sourcepos.end.column; + let sp = ns.data.borrow().sourcepos; + spx.push((sp, adj.len())); + sourcepos.end.column = sp.end.column; ns.detach(); } - _ => { - // Post-process once we are finished joining text nodes - self.postprocess_text_node(n, root, &mut sourcepos); - break; - } + _ => break, } } - NodeValue::Link(..) | NodeValue::Image(..) | NodeValue::WikiLink(..) => { - this_bracket = true; - break; - } - _ => break, + + self.postprocess_text_node(n, root, &mut sourcepos, &spx); + } + NodeValue::Link(..) | NodeValue::Image(..) | NodeValue::WikiLink(..) => { + // Don't recurse into links (no links-within-links) or + // images (title part). + this_bracket = true; } + _ => {} } n_ast.sourcepos = sourcepos; @@ -3009,18 +3004,20 @@ where node: &'a AstNode<'a>, text: &mut String, sourcepos: &mut Sourcepos, + spx: &[(Sourcepos, usize)], ) { if self.options.extension.tasklist { self.process_tasklist(node, text, sourcepos); } if self.options.extension.autolink { - autolink::process_autolinks( + autolink::process_email_autolinks( self.arena, node, text, self.options.parse.relaxed_autolinks, sourcepos, + spx, ); } } diff --git a/src/tests/autolink.rs b/src/tests/autolink.rs index 9fb766cd..1b01a9c0 100644 --- a/src/tests/autolink.rs +++ b/src/tests/autolink.rs @@ -239,29 +239,6 @@ fn autolink_relaxed_links_schemes() { #[test] fn sourcepos_correctly_restores_context() { - // There's unsoundness in trying to maintain and adjust sourcepos - // when doing autolinks in the light of: - // - // a) Some source elements introducing a different number of characters - // to the content text than they take in source, i.e. smart - // punctuation. - // - // b) Text node consolidation happening before autolinking. 
- // - // (b) is obviously non-optional, but it means we end up with Text - // nodes with different byte counts than their sourcepos span lengths. - // - // One possible solution would be to actually accumulate multiple - // sourcepos spans per Text node, each also tracking the number of - // bytes of content text it's responsible for. This would work well - // enough as long as we never had to adjust a sourcepos into a spot - // within a sourcepos span that had a target text width where it - // wasn't equal. That probably wouldn't happen, though -- i.e. we're - // never autolinking into the middle of a rendered smart punctuation. - // - // The below documents test cases that used to break; I suspect smart - // punctuation is still a breaking case however! - assert_ast_match!( [], "ab _cde_ f@g.ee h*ijklm* n", diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index bc7a58a6..9be71e54 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -83,19 +83,57 @@ fn trailing_hyphen() { ); } -#[ignore] #[test] fn trailing_hyphen_matches() { + // TODO: repeat below without smart, direct entry + // TODO: "\ at EOL" style break breaks sourcepos + // TODO: check no empty text node before autolink at start assert_ast_match!( [extension.autolink, parse.smart], - "3@.l--", - (document (1:1-1:6) [ - (paragraph (1:1-1:6) [ - (link (1:1-1:4) [ - (text (1:1-1:4) "3@.l") - ]) - (text (1:5-1:6) "–") - ]) - ]) + "--\n" + "--(3@.l--\n", + (document (1:1-2:9) [ + (paragraph (1:1-2:9) [ + (text (1:1-1:2) "–") // en-dash + (softbreak (1:3-1:3)) + (text (2:1-2:3) "–(") // en-dash + (link (2:4-2:7) "mailto:3@.l" [ + (text (2:4-2:7) "3@.l") + ]) + (text (2:8-2:9) "–") // en-dash + ]) + ]) + ); +} + +#[test] +fn smart_sourcepos() { + assert_ast_match!( + [parse.smart], + ": _--_ **---**\n\n" + // As above, but entered directly. 
+ ": _–_ **—**\n", + (document (1:1-3:15) [ + (paragraph (1:1-1:14) [ + (text (1:1-1:2) ": ") + (emph (1:3-1:6) [ + (text (1:4-1:5) "–") // en-dash + ]) + (text (1:7-1:7) " ") + (strong (1:8-1:14) [ + (text (1:10-1:12) "—") // em-dash + ]) + ]) + (paragraph (3:1-3:15) [ + (text (3:1-3:2) ": ") + (emph (3:3-3:7) [ + (text (3:4-3:6) "–") // en-dash; 3 bytes in input + ]) + (text (3:8-3:8) " ") + (strong (3:9-3:15) [ + (text (3:11-3:13) "—") // em-dash; (still) 3 bytes + ]) + ]) + ]) ); } From 21eae3012f7a12e484e93e945614fb9cac2ab324 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 18:17:55 +1100 Subject: [PATCH 16/30] parser: remove empty node left over by postprocessing. --- src/parser/mod.rs | 8 +++++++- src/tests/autolink.rs | 19 +++++++++++++++++++ src/tests/fuzz.rs | 39 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 743a88b3..b0a81c52 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2952,6 +2952,7 @@ where while let Some(n) = nch { let mut this_bracket = false; + let mut emptied = false; let n_ast = &mut n.data.borrow_mut(); let mut sourcepos = n_ast.sourcepos; @@ -2975,6 +2976,7 @@ where } self.postprocess_text_node(n, root, &mut sourcepos, &spx); + emptied = root.len() == 0; } NodeValue::Link(..) | NodeValue::Image(..) | NodeValue::WikiLink(..) 
=> { // Don't recurse into links (no links-within-links) or @@ -2986,11 +2988,15 @@ where n_ast.sourcepos = sourcepos; - if !this_bracket { + if !this_bracket && !emptied { children.push(n); } nch = n.next_sibling(); + + if emptied { + n.detach(); + } } // Push children onto work stack in reverse order so they are diff --git a/src/tests/autolink.rs b/src/tests/autolink.rs index 1b01a9c0..7f3c1f6b 100644 --- a/src/tests/autolink.rs +++ b/src/tests/autolink.rs @@ -408,3 +408,22 @@ fn autolink_sourcepos() { ]) ); } + +#[test] +fn autolink_consecutive_email() { + assert_ast_match!( + [extension.autolink], + "scyther@pokemon.com/beedrill@pokemon.com", + (document (1:1-1:40) [ + (paragraph (1:1-1:40) [ + (link (1:1-1:19) "mailto:scyther@pokemon.com" [ + (text (1:1-1:19) "scyther@pokemon.com") + ]) + (text (1:20-1:20) "/") + (link (1:21-1:40) "mailto:beedrill@pokemon.com" [ + (text (1:21-1:40) "beedrill@pokemon.com") + ]) + ]) + ]) + ); +} diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 9be71e54..eabd5b5a 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -84,10 +84,8 @@ fn trailing_hyphen() { } #[test] -fn trailing_hyphen_matches() { - // TODO: repeat below without smart, direct entry +fn trailing_smart_endash_matches() { // TODO: "\ at EOL" style break breaks sourcepos - // TODO: check no empty text node before autolink at start assert_ast_match!( [extension.autolink, parse.smart], "--\n" @@ -106,6 +104,41 @@ fn trailing_hyphen_matches() { ); } +#[test] +fn trailing_endash_matches() { + assert_ast_match!( + [extension.autolink], + "–\n" + "–(3@.l–\n", + (document (1:1-2:11) [ + (paragraph (1:1-2:11) [ + (text (1:1-1:3) "–") // en-dash + (softbreak (1:4-1:4)) + (text (2:1-2:4) "–(") // en-dash + (link (2:5-2:8) "mailto:3@.l" [ + (text (2:5-2:8) "3@.l") + ]) + (text (2:9-2:11) "–") // en-dash + ]) + ]) + ); +} + +#[test] +fn no_empty_text_before_email() { + assert_ast_match!( + [extension.autolink], + "a@b.c\n", + (document (1:1-1:5) [ + (paragraph 
(1:1-1:5) [ + (link (1:1-1:5) "mailto:a@b.c" [ + (text (1:1-1:5) "a@b.c") + ]) + ]) + ]) + ); +} + #[test] fn smart_sourcepos() { assert_ast_match!( From 6849733bec144c7c2bbdece856e3a782bc7ec7c4 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 18:23:25 +1100 Subject: [PATCH 17/30] inlines: fix LineBreak sourcepos from backslash. --- src/parser/inlines.rs | 6 +++++- src/tests/fuzz.rs | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 25df673c..2fabba57 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -1208,7 +1208,11 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { inline_text } } else if !self.eof() && self.skip_line_end() { - self.make_inline(NodeValue::LineBreak, startpos, self.pos - 1) + let inl = self.make_inline(NodeValue::LineBreak, startpos, self.pos - 1); + self.line += 1; + self.column_offset = -(self.pos as isize); + self.skip_spaces(); + inl } else { self.make_inline( NodeValue::Text("\\".to_string()), diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index eabd5b5a..94fa4ba8 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -170,3 +170,19 @@ fn smart_sourcepos() { ]) ); } + +#[test] +fn linebreak_sourcepos() { + assert_ast_match!( + [], + "a\\\n" + "b\n", + (document (1:1-2:1) [ + (paragraph (1:1-2:1) [ + (text (1:1-1:1) "a") + (linebreak (1:2-1:3)) + (text (2:1-2:1) "b") + ]) + ]) + ); +} From 4ed12c7f097707ce6c193b8789bd7a8528aa5c8c Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 18:33:36 +1100 Subject: [PATCH 18/30] autolink: correct "=" case. 
--- flake.nix | 2 ++ src/parser/autolink.rs | 33 ++++++++++----------- src/parser/inlines.rs | 67 +++++++++++++++++++----------------------- src/tests/fuzz.rs | 20 +++++++++++++ 4 files changed, 68 insertions(+), 54 deletions(-) diff --git a/flake.nix b/flake.nix index 4a1d797f..7275de65 100644 --- a/flake.nix +++ b/flake.nix @@ -114,6 +114,8 @@ formatter = pkgs.alejandra; devShells.default = pkgs.mkShell { + name = "comrak"; + inputsFrom = builtins.attrValues self.checks.${system}; nativeBuildInputs = [ diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index dd9ecabe..b9e1b22d 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -74,10 +74,10 @@ pub(crate) fn process_email_autolinks<'a>( // - if remaining `i` is greater than the byte count `x`, // set `i -= x` and continue. // - if remaining `i` is equal to the byte count `x`, - // set `e = sp.end.column - 1` and finish. + // set `e = sp.end.column` and finish. // - if remaining `i` is less than the byte count `x`, // assert `sp.end.column - sp.start.column + 1 == x` (1), - // set `e = sp.start.column + i - 1` and finish. + // set `e = sp.start.column + i - 1` (2) and finish. // // (1) If `x` doesn't equal the range covered between the start // and end column, there's no way to determine sourcepos within @@ -85,17 +85,17 @@ pub(crate) fn process_email_autolinks<'a>( // matched an email autolink with some smart punctuation in it, // or worse. // - // NOTE: a little iffy on the way I've added `- 1` --- what we - // calculate here technically is the start column of the linked - // portion, then adjusted. I think this should be robust, but needs - // checking at edges. + // (2) A little iffy on the way I've added `- 1` --- what we + // calculate here technically is the start column of the linked + // portion, then adjusted. I think this should be robust, but + // needs checking at edges. 
let mut rem_i = i; for &(sp, x) in spx { if rem_i > x { rem_i -= x; } else if rem_i == x { - sourcepos.end.column = sp.end.column - 1; + sourcepos.end.column = sp.end.column; rem_i = 0; break; } else { @@ -122,17 +122,14 @@ pub(crate) fn process_email_autolinks<'a>( post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; if let Some(remain) = remain { - post.insert_after(make_inline( - arena, - NodeValue::Text(remain.to_string()), - ( - sourcepos.end.line, - nsp.end.column + 1, - sourcepos.end.line, - initial_end_col, - ) - .into(), - )); + let asp: Sourcepos = ( + sourcepos.end.line, + nsp.end.column + 1, + sourcepos.end.line, + initial_end_col, + ) + .into(); + post.insert_after(make_inline(arena, NodeValue::Text(remain.to_string()), asp)); } return; diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 2fabba57..29c4e3fa 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -811,43 +811,38 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } else { self.scan_to_closing_dollar(opendollars) } - .filter(|epos| epos - startpos >= fence_length * 2 + 1); + .filter(|endpos| endpos - startpos >= fence_length * 2 + 1); - match endpos { - None => { - if code_math { - self.pos = startpos + 1; - self.make_inline(NodeValue::Text("$".to_string()), self.pos - 1, self.pos - 1) - } else { - self.pos = startpos + fence_length; - self.make_inline( - NodeValue::Text("$".repeat(opendollars)), - self.pos - fence_length, - self.pos - 1, - ) - } - } - Some(endpos) => { - let buf = &self.input[startpos + fence_length..endpos - fence_length]; - let buf: Vec = if code_math || opendollars == 1 { - strings::normalize_code(buf) - } else { - buf.to_vec() - }; - let math = NodeMath { - dollar_math: !code_math, - display_math: opendollars == 2, - literal: String::from_utf8(buf).unwrap(), - }; - let node = self.make_inline(NodeValue::Math(math), startpos, endpos - 1); - self.adjust_node_newlines( - node, - endpos - startpos - fence_length, - 
fence_length, - parent_line_offsets, - ); - node - } + if let Some(endpos) = endpos { + let buf = &self.input[startpos + fence_length..endpos - fence_length]; + let buf: Vec = if code_math || opendollars == 1 { + strings::normalize_code(buf) + } else { + buf.to_vec() + }; + let math = NodeMath { + dollar_math: !code_math, + display_math: opendollars == 2, + literal: String::from_utf8(buf).unwrap(), + }; + let node = self.make_inline(NodeValue::Math(math), startpos, endpos - 1); + self.adjust_node_newlines( + node, + endpos - startpos - fence_length, + fence_length, + parent_line_offsets, + ); + node + } else if code_math { + self.pos = startpos + 1; + self.make_inline(NodeValue::Text("$".to_string()), self.pos - 1, self.pos - 1) + } else { + self.pos = startpos + fence_length; + self.make_inline( + NodeValue::Text("$".repeat(opendollars)), + self.pos - fence_length, + self.pos - 1, + ) } } diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 94fa4ba8..6aa6d9fe 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -186,3 +186,23 @@ fn linebreak_sourcepos() { ]) ); } + +#[test] +fn echaw() { + assert_ast_match!( + [extension.autolink], + " Date: Thu, 27 Feb 2025 18:33:36 +1100 Subject: [PATCH 19/30] autolink: deal with consecutive replacements. 
--- src/parser/autolink.rs | 97 +++++++++++++++++++++++++++++++--------- src/parser/mod.rs | 11 ++--- src/tests/autolink.rs | 19 ++++++++ src/tests/fuzz.rs | 20 +++++++++ src/tests/regressions.rs | 62 +++++++++++++++++++++++++ 5 files changed, 183 insertions(+), 26 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index b9e1b22d..52de43b6 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -2,20 +2,18 @@ use crate::character_set::character_set; use crate::ctype::{isalnum, isalpha, isspace}; use crate::nodes::{AstNode, NodeLink, NodeValue, Sourcepos}; use crate::parser::inlines::make_inline; +use std::collections::VecDeque; use std::str; use typed_arena::Arena; use unicode_categories::UnicodeCategories; -// TODO: this can probably be cleaned up a lot. It used to handle all three of -// {url,www,email}_match, but now just the last of those. (This is still per -// upstream cmark-gfm, so it's not easily changed without breaking compat.) pub(crate) fn process_email_autolinks<'a>( arena: &'a Arena>, node: &'a AstNode<'a>, contents_str: &mut String, relaxed_autolinks: bool, sourcepos: &mut Sourcepos, - spx: &[(Sourcepos, usize)], + mut spx: VecDeque<(Sourcepos, usize)>, ) { let contents = contents_str.as_bytes(); let len = contents.len(); @@ -90,23 +88,37 @@ pub(crate) fn process_email_autolinks<'a>( // portion, then adjusted. I think this should be robust, but // needs checking at edges. 
- let mut rem_i = i; - for &(sp, x) in spx { - if rem_i > x { - rem_i -= x; - } else if rem_i == x { - sourcepos.end.column = sp.end.column; - rem_i = 0; - break; - } else { - // rem_i < x - assert_eq!(sp.end.column - sp.start.column + 1, x); - sourcepos.end.column = sp.start.column + rem_i - 1; - rem_i = 0; - break; + if i > 0 { + let mut rem_i = i; + while let Some((sp, x)) = spx.pop_front() { + if rem_i > x { + rem_i -= x; + } else if rem_i == x { + sourcepos.end.column = sp.end.column; + rem_i = 0; + break; + } else { + // rem_i < x + assert_eq!(sp.end.column - sp.start.column + 1, x); + sourcepos.end.column = sp.start.column + rem_i - 1; + spx.push_front(( + ( + sp.start.line, + sp.start.column + rem_i, + sp.end.line, + sp.end.column, + ) + .into(), + x - rem_i, + )); + rem_i = 0; + break; + } } + assert!(rem_i == 0); + } else { + sourcepos.end.column = 0; } - assert!(rem_i == 0); contents_str.truncate(i); let nsp: Sourcepos = ( @@ -122,14 +134,57 @@ pub(crate) fn process_email_autolinks<'a>( post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; if let Some(remain) = remain { - let asp: Sourcepos = ( + let mut asp: Sourcepos = ( sourcepos.end.line, nsp.end.column + 1, sourcepos.end.line, initial_end_col, ) .into(); - post.insert_after(make_inline(arena, NodeValue::Text(remain.to_string()), asp)); + + // Consume `skip` from spx. 
+ let mut rem_skip = skip; + while let Some((sp, x)) = spx.pop_front() { + if rem_skip > x { + rem_skip -= x; + } else if rem_skip == x { + rem_skip = 0; + break; + } else { + // rem_skip < x + assert_eq!(sp.end.column - sp.start.column + 1, x); + spx.push_front(( + ( + sp.start.line, + sp.start.column + rem_skip, + sp.end.line, + sp.end.column, + ) + .into(), + x - rem_skip, + )); + rem_skip = 0; + break; + } + } + assert!(rem_skip == 0); + + let after = make_inline(arena, NodeValue::Text(remain.to_string()), asp); + post.insert_after(after); + + let after_ast = &mut after.data.borrow_mut(); + process_email_autolinks( + arena, + after, + match after_ast.value { + NodeValue::Text(ref mut t) => t, + _ => unreachable!(), + }, + relaxed_autolinks, + &mut asp, + spx, + ); + after_ast.sourcepos = asp; } return; diff --git a/src/parser/mod.rs b/src/parser/mod.rs index b0a81c52..188ff2a8 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -21,7 +21,7 @@ use crate::scanners::{self, SetextChar}; use crate::strings::{self, split_off_front_matter, Case}; use std::cell::RefCell; use std::cmp::min; -use std::collections::HashMap; +use std::collections::{HashMap, VecDeque}; use std::fmt::{self, Debug, Formatter}; use std::mem; use std::panic::RefUnwindSafe; @@ -2961,13 +2961,14 @@ where // Join adjacent text nodes together, then post-process. // Record the original list of sourcepos and bytecounts // for the post-processing step. 
- let mut spx = vec![(sourcepos, root.len())]; + let mut spx = VecDeque::new(); + spx.push_back((sourcepos, root.len())); while let Some(ns) = n.next_sibling() { match ns.data.borrow().value { NodeValue::Text(ref adj) => { root.push_str(adj); let sp = ns.data.borrow().sourcepos; - spx.push((sp, adj.len())); + spx.push_back((sp, adj.len())); sourcepos.end.column = sp.end.column; ns.detach(); } @@ -2975,7 +2976,7 @@ where } } - self.postprocess_text_node(n, root, &mut sourcepos, &spx); + self.postprocess_text_node(n, root, &mut sourcepos, spx); emptied = root.len() == 0; } NodeValue::Link(..) | NodeValue::Image(..) | NodeValue::WikiLink(..) => { @@ -3010,7 +3011,7 @@ where node: &'a AstNode<'a>, text: &mut String, sourcepos: &mut Sourcepos, - spx: &[(Sourcepos, usize)], + spx: VecDeque<(Sourcepos, usize)>, ) { if self.options.extension.tasklist { self.process_tasklist(node, text, sourcepos); diff --git a/src/tests/autolink.rs b/src/tests/autolink.rs index 7f3c1f6b..997a5d51 100644 --- a/src/tests/autolink.rs +++ b/src/tests/autolink.rs @@ -427,3 +427,22 @@ fn autolink_consecutive_email() { ]) ); } + +#[test] +fn autolink_consecutive_email_smart() { + assert_ast_match!( + [extension.autolink, parse.smart], + "scyther@pokemon.com--beedrill@pokemon.com", + (document (1:1-1:41) [ + (paragraph (1:1-1:41) [ + (link (1:1-1:19) "mailto:scyther@pokemon.com" [ + (text (1:1-1:19) "scyther@pokemon.com") + ]) + (text (1:20-1:21) "–") // en-dash + (link (1:22-1:41) "mailto:beedrill@pokemon.com" [ + (text (1:22-1:41) "beedrill@pokemon.com") + ]) + ]) + ]) + ); +} diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 6aa6d9fe..10601e88 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -206,3 +206,23 @@ fn echaw() { ]) ); } + +#[test] +fn echaw2() { + assert_ast_match!( + [extension.autolink, parse.smart], + ":C@.t'C@.t", + (document (1:1-1:10) [ + (paragraph (1:1-1:10) [ + (text (1:1-1:1) ":") + (link (1:2-1:5) "mailto:C@.t" [ + (text (1:2-1:5) "C@.t") + ]) + (text 
(1:6-1:6) "’") + (link (1:7-1:10) "mailto:C@.t" [ + (text (1:7-1:10) "C@.t") + ]) + ]) + ]) + ); +} diff --git a/src/tests/regressions.rs b/src/tests/regressions.rs index 8d54a217..5e98a404 100644 --- a/src/tests/regressions.rs +++ b/src/tests/regressions.rs @@ -177,3 +177,65 @@ fn sourcepos_link_items() { ]) ); } + +#[test] +fn assorted_links() { + assert_ast_match!( + [extension.autolink], + r#"hello world +hello [foo](https://example.com) world +hello [foo] world +hello [bar][bar] world +hello https://example.com/foo world +hello www.example.com world +hello foo@example.com world + +[foo]: https://example.com +[bar]: https://example.com"#, + (document (1:1-10:26) [ + (paragraph (1:1-7:27) [ + (text (1:1-1:6) "hello ") + (link (1:7-1:32) "https://example.com/fooo" [ + (text (1:8-1:31) "https://example.com/fooo") + ]) + (text (1:33-1:38) " world") + (softbreak (1:39-1:39)) + (text (2:1-2:6) "hello ") + (link (2:7-2:32) "https://example.com" [ + (text (2:8-2:10) "foo") + ]) + (text (2:33-2:38) " world") + (softbreak (2:39-2:39)) + (text (3:1-3:6) "hello ") + (link (3:7-3:11) "https://example.com" [ + (text (3:8-3:10) "foo") + ]) + (text (3:12-3:17) " world") + (softbreak (3:18-3:18)) + (text (4:1-4:6) "hello ") + (link (4:7-4:16) "https://example.com" [ + (text (4:8-4:10) "bar") + ]) + (text (4:17-4:22) " world") + (softbreak (4:23-4:23)) + (text (5:1-5:6) "hello ") + (link (5:7-5:29) "https://example.com/foo" [ + (text (5:7-5:29) "https://example.com/foo") + ]) + (text (5:30-5:35) " world") + (softbreak (5:36-5:36)) + (text (6:1-6:6) "hello ") + (link (6:7-6:21) "http://www.example.com" [ + (text (6:7-6:21) "www.example.com") + ]) + (text (6:22-6:27) " world") + (softbreak (6:28-6:28)) + (text (7:1-7:6) "hello ") + (link (7:7-7:21) "mailto:foo@example.com" [ + (text (7:7-7:21) "foo@example.com") + ]) + (text (7:22-7:27) " world") + ]) + ]) + ); +} From 1d030cf0900af1150300abe75d729736946868f8 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 
20:41:06 +1100 Subject: [PATCH 20/30] autolink: refactor consume_spx. --- src/parser/autolink.rs | 132 +++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 83 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 52de43b6..4f6a0b98 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -64,61 +64,7 @@ pub(crate) fn process_email_autolinks<'a>( }; let initial_end_col = sourcepos.end.column; - // Sourcepos end column `e` of the original node (set by writing - // to `*sourcepos`) determined by advancing through `spx` until `i` - // bytes of input are seen. - // - // For each element `(sp, x)` in `spx`: - // - if remaining `i` is greater than the byte count `x`, - // set `i -= x` and continue. - // - if remaining `i` is equal to the byte count `x`, - // set `e = sp.end.column` and finish. - // - if remaining `i` is less than the byte count `x`, - // assert `sp.end.column - sp.start.column + 1 == x` (1), - // set `e = sp.start.column + i - 1` (2) and finish. - // - // (1) If `x` doesn't equal the range covered between the start - // and end column, there's no way to determine sourcepos within - // the range. This is a bug if it happens; it suggests we've - // matched an email autolink with some smart punctuation in it, - // or worse. - // - // (2) A little iffy on the way I've added `- 1` --- what we - // calculate here technically is the start column of the linked - // portion, then adjusted. I think this should be robust, but - // needs checking at edges. 
- - if i > 0 { - let mut rem_i = i; - while let Some((sp, x)) = spx.pop_front() { - if rem_i > x { - rem_i -= x; - } else if rem_i == x { - sourcepos.end.column = sp.end.column; - rem_i = 0; - break; - } else { - // rem_i < x - assert_eq!(sp.end.column - sp.start.column + 1, x); - sourcepos.end.column = sp.start.column + rem_i - 1; - spx.push_front(( - ( - sp.start.line, - sp.start.column + rem_i, - sp.end.line, - sp.end.column, - ) - .into(), - x - rem_i, - )); - rem_i = 0; - break; - } - } - assert!(rem_i == 0); - } else { - sourcepos.end.column = 0; - } + sourcepos.end.column = if i > 0 { consume_spx(&mut spx, i) } else { 0 }; contents_str.truncate(i); let nsp: Sourcepos = ( @@ -134,6 +80,8 @@ pub(crate) fn process_email_autolinks<'a>( post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; if let Some(remain) = remain { + consume_spx(&mut spx, skip); + let mut asp: Sourcepos = ( sourcepos.end.line, nsp.end.column + 1, @@ -141,34 +89,6 @@ pub(crate) fn process_email_autolinks<'a>( initial_end_col, ) .into(); - - // Consume `skip` from spx. - let mut rem_skip = skip; - while let Some((sp, x)) = spx.pop_front() { - if rem_skip > x { - rem_skip -= x; - } else if rem_skip == x { - rem_skip = 0; - break; - } else { - // rem_skip < x - assert_eq!(sp.end.column - sp.start.column + 1, x); - spx.push_front(( - ( - sp.start.line, - sp.start.column + rem_skip, - sp.end.line, - sp.end.column, - ) - .into(), - x - rem_skip, - )); - rem_skip = 0; - break; - } - } - assert!(rem_skip == 0); - let after = make_inline(arena, NodeValue::Text(remain.to_string()), asp); post.insert_after(after); @@ -192,6 +112,52 @@ pub(crate) fn process_email_autolinks<'a>( } } +// Sourcepos end column `e` of the original node (set by writing to +// `*sourcepos`) determined by advancing through `spx` until `i` bytes of input +// are seen. +// +// For each element `(sp, x)` in `spx`: +// - if remaining `i` is greater than the byte count `x`, +// set `i -= x` and continue. 
+// - if remaining `i` is equal to the byte count `x`, +// set `e = sp.end.column` and finish. +// - if remaining `i` is less than the byte count `x`, +// assert `sp.end.column - sp.start.column + 1 == x` (1), +// set `e = sp.start.column + i - 1` (2) and finish. +// +// (1) If `x` doesn't equal the range covered between the start and end column, +// there's no way to determine sourcepos within the range. This is a bug if +// it happens; it suggests we've matched an email autolink with some smart +// punctuation in it, or worse. +// +// (2) A little iffy on the way I've added `- 1` --- what we calculate here +// technically is the start column of the linked portion, then adjusted. I +// think this should be robust, but needs checking at edges. +fn consume_spx(spx: &mut VecDeque<(Sourcepos, usize)>, mut rem: usize) -> usize { + while let Some((sp, x)) = spx.pop_front() { + if rem > x { + rem -= x; + } else if rem == x { + return sp.end.column; + } else { + // rem < x + assert_eq!(sp.end.column - sp.start.column + 1, x); + spx.push_front(( + ( + sp.start.line, + sp.start.column + rem, + sp.end.line, + sp.end.column, + ) + .into(), + x - rem, + )); + return sp.start.column + rem - 1; + } + } + unreachable!(); +} + fn email_match<'a>( arena: &'a Arena>, contents: &[u8], From b86e2242923e90e8e3c3647d9375b05aed27d2ee Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 20:41:06 +1100 Subject: [PATCH 21/30] sourcepos: good grief. 
--- src/parser/autolink.rs | 6 +++++- src/parser/inlines.rs | 10 ++++++---- src/strings.rs | 9 +++++++-- src/tests/fuzz.rs | 29 +++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 4f6a0b98..3ffd8f7b 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -64,7 +64,11 @@ pub(crate) fn process_email_autolinks<'a>( }; let initial_end_col = sourcepos.end.column; - sourcepos.end.column = if i > 0 { consume_spx(&mut spx, i) } else { 0 }; + sourcepos.end.column = if i > 0 { + consume_spx(&mut spx, i) + } else { + sourcepos.start.column - 1 + }; contents_str.truncate(i); let nsp: Sourcepos = ( diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 29c4e3fa..4e17f4ac 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -291,16 +291,16 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { '$' => Some(self.handle_dollars(&node_ast.line_offsets)), '|' if self.options.extension.spoiler => Some(self.handle_delim(b'|')), _ => { - let endpos = self.find_special_char(); + let mut endpos = self.find_special_char(); let mut contents = self.input[self.pos..endpos].to_vec(); - let startpos = self.pos; + let mut startpos = self.pos; self.pos = endpos; if self .peek_char() .map_or(false, |&c| strings::is_line_end_char(c)) { - strings::rtrim(&mut contents); + endpos -= strings::rtrim(&mut contents); } // if we've just produced a LineBreak, then we should consume any leading @@ -308,7 +308,9 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { if node.last_child().map_or(false, |n| { matches!(n.data.borrow().value, NodeValue::LineBreak) }) { - strings::ltrim(&mut contents); + // TODO: test this more explicitly. 
+ let n = strings::ltrim(&mut contents); + startpos += n; } Some(self.make_inline( diff --git a/src/strings.rs b/src/strings.rs index 0a13db95..68fc7f82 100644 --- a/src/strings.rs +++ b/src/strings.rs @@ -144,17 +144,19 @@ pub fn chop_trailing_hashtags(line: &mut Vec) { } } -pub fn rtrim(line: &mut Vec) { +pub fn rtrim(line: &mut Vec) -> usize { let spaces = line.iter().rev().take_while(|&&b| isspace(b)).count(); let new_len = line.len() - spaces; line.truncate(new_len); + spaces } -pub fn ltrim(line: &mut Vec) { +pub fn ltrim(line: &mut Vec) -> usize { let spaces = line.iter().take_while(|&&b| isspace(b)).count(); shift_buf_left(line, spaces); let new_len = line.len() - spaces; line.truncate(new_len); + spaces } pub fn trim(line: &mut Vec) { @@ -191,6 +193,9 @@ pub fn trim_slice(mut i: &[u8]) -> &[u8] { } fn shift_buf_left(buf: &mut [u8], n: usize) { + if n == 0 { + return; + } assert!(n <= buf.len()); let keep = buf.len() - n; unsafe { diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 10601e88..6bde3f12 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -226,3 +226,32 @@ fn echaw2() { ]) ); } + +#[test] +fn echaw3() { + assert_ast_match!( + [extension.autolink, parse.smart], + // XXX As an extra special case, NUL bytes are expanded to U+FFFD + // REPLACEMENT CHARACTER (UTF-8: EF BF BD) during the feed stage, so + // sourcepos sees three bytes (!). I might like to change this later. + "c@.r\0\t\r" + "z \n" + " f@.x", + (document (1:1-3:5) [ + (paragraph (1:1-3:5) [ + (link (1:1-1:4) "mailto:c@.r" [ + (text (1:1-1:4) "c@.r") + ]) + (text (1:5-1:7) "�") + // !! Spaces at EOL are trimmed. + // See parser::inlines::Subject::parse_inline's final case. 
+ (softbreak (1:9-1:9)) + (text (2:1-2:1) "z") + (linebreak (2:2-2:4)) + (link (3:2-3:5) "mailto:f@.x" [ + (text (3:2-3:5) "f@.x") + ]) + ]) + ]) + ); +} From dca38dcd15cd0e7c8136842217c35f6fdadc8bfc Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 18:33:36 +1100 Subject: [PATCH 22/30] autolink: no longer iffy on this; record another question. --- src/parser/autolink.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 3ffd8f7b..60e7200c 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -64,11 +64,7 @@ pub(crate) fn process_email_autolinks<'a>( }; let initial_end_col = sourcepos.end.column; - sourcepos.end.column = if i > 0 { - consume_spx(&mut spx, i) - } else { - sourcepos.start.column - 1 - }; + sourcepos.end.column = consume_spx(&mut spx, i); contents_str.truncate(i); let nsp: Sourcepos = ( @@ -84,6 +80,8 @@ pub(crate) fn process_email_autolinks<'a>( post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; if let Some(remain) = remain { + // TODO: why we don't need the return value here? What's the + // relationship between it, nsp.end.column, and initial_end_col? consume_spx(&mut spx, skip); let mut asp: Sourcepos = ( @@ -127,16 +125,12 @@ pub(crate) fn process_email_autolinks<'a>( // set `e = sp.end.column` and finish. // - if remaining `i` is less than the byte count `x`, // assert `sp.end.column - sp.start.column + 1 == x` (1), -// set `e = sp.start.column + i - 1` (2) and finish. +// set `e = sp.start.column + i - 1` and finish. // // (1) If `x` doesn't equal the range covered between the start and end column, // there's no way to determine sourcepos within the range. This is a bug if // it happens; it suggests we've matched an email autolink with some smart // punctuation in it, or worse. 
-// -// (2) A little iffy on the way I've added `- 1` --- what we calculate here -// technically is the start column of the linked portion, then adjusted. I -// think this should be robust, but needs checking at edges. fn consume_spx(spx: &mut VecDeque<(Sourcepos, usize)>, mut rem: usize) -> usize { while let Some((sp, x)) = spx.pop_front() { if rem > x { From cba4264d4935cdcd0e092bb72c46c0c410b57dbd Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 21:25:47 +1100 Subject: [PATCH 23/30] autolink: answer this question. --- src/parser/autolink.rs | 5 ++--- src/tests/fuzz.rs | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 60e7200c..05185c9a 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -80,9 +80,8 @@ pub(crate) fn process_email_autolinks<'a>( post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; if let Some(remain) = remain { - // TODO: why we don't need the return value here? What's the - // relationship between it, nsp.end.column, and initial_end_col? - consume_spx(&mut spx, skip); + let rs = consume_spx(&mut spx, skip); + assert_eq!(rs, nsp.end.column); let mut asp: Sourcepos = ( sourcepos.end.line, diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 6bde3f12..c25974af 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -85,7 +85,6 @@ fn trailing_hyphen() { #[test] fn trailing_smart_endash_matches() { - // TODO: "\ at EOL" style break breaks sourcepos assert_ast_match!( [extension.autolink, parse.smart], "--\n" From 87aa79c6831a03e68b93585cfce09ff806da80b6 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 21:29:19 +1100 Subject: [PATCH 24/30] tests: new failing test; entities are cursed. 
--- src/tests/fuzz.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index c25974af..3f313855 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -254,3 +254,16 @@ fn echaw3() { ]) ); } + +#[test] +fn echaw4() { + assert_ast_match!( + [extension.autolink, parse.smart], + "-@_.e--", + (document (1:1-1:16) [ + (paragraph (1:1-1:16) [ + (text (1:1-1:16) "-@_.e–") // underbar & en-dash + ]) + ]) + ); +} From 82cb6e6387511e8185eed9c8f924ad469de56710 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Thu, 27 Feb 2025 18:33:36 +1100 Subject: [PATCH 25/30] autolink: consume_spx to find new end.column. --- src/parser/autolink.rs | 8 ++++---- src/tests/fuzz.rs | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 05185c9a..6f629ecb 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -66,12 +66,15 @@ pub(crate) fn process_email_autolinks<'a>( sourcepos.end.column = consume_spx(&mut spx, i); + let nsp_end_col = consume_spx(&mut spx, skip); + contents_str.truncate(i); + let nsp: Sourcepos = ( sourcepos.end.line, sourcepos.end.column + 1, sourcepos.end.line, - sourcepos.end.column + skip, + nsp_end_col, ) .into(); post.data.borrow_mut().sourcepos = nsp; @@ -80,9 +83,6 @@ pub(crate) fn process_email_autolinks<'a>( post.first_child().unwrap().data.borrow_mut().sourcepos = nsp; if let Some(remain) = remain { - let rs = consume_spx(&mut spx, skip); - assert_eq!(rs, nsp.end.column); - let mut asp: Sourcepos = ( sourcepos.end.line, nsp.end.column + 1, diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 3f313855..656491ac 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -257,12 +257,16 @@ fn echaw3() { #[test] fn echaw4() { + // _ resolves to a plain ASCII underscore "_". 
assert_ast_match!( [extension.autolink, parse.smart], "-@_.e--", (document (1:1-1:16) [ (paragraph (1:1-1:16) [ - (text (1:1-1:16) "-@_.e–") // underbar & en-dash + (link (1:1-1:14) "mailto:-@_.e" [ + (text (1:1-1:14) "-@_.e") + ]) + (text (1:15-1:16) "–") // en-dash ]) ]) ); From 642ba7be2c8134f5409d213384d785deefbc112e Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Fri, 28 Feb 2025 14:45:04 +1100 Subject: [PATCH 26/30] inlines: first fixes to emphasis sourcepos. --- src/parser/inlines.rs | 44 ++++++++++++++++++++++++++---------------- src/tests/fuzz.rs | 38 ++++++++++++++++++++++++++++++++++++ src/tests/sourcepos.rs | 4 +++- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 4e17f4ac..646e639c 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -99,6 +99,21 @@ pub struct Delimiter<'a: 'd, 'd> { next: Cell>>, } +impl<'a: 'd, 'd> std::fmt::Debug for Delimiter<'a, 'd> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "[pos {}, len {}, delim_char {:?}, open? {} close? 
{} -- {}]", + self.position, + self.length, + self.delim_char, + self.can_open, + self.can_close, + self.inl.data.borrow().sourcepos + ) + } +} + struct Bracket<'a> { inl_text: &'a AstNode<'a>, position: usize, @@ -1136,22 +1151,14 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.pos, self.pos, ); - { - // if we have `___` or `***` then we need to adjust the sourcepos colums by 1 - let triple_adjustment = if opener_num_chars > 0 && use_delims == 2 { - 1 - } else { - 0 - }; - emph.data.borrow_mut().sourcepos = ( - opener.inl.data.borrow().sourcepos.start.line, - opener.inl.data.borrow().sourcepos.start.column + triple_adjustment, - closer.inl.data.borrow().sourcepos.end.line, - closer.inl.data.borrow().sourcepos.end.column - triple_adjustment, - ) - .into(); - } + emph.data.borrow_mut().sourcepos = ( + opener.inl.data.borrow().sourcepos.start.line, + opener.inl.data.borrow().sourcepos.start.column + opener_num_chars, + closer.inl.data.borrow().sourcepos.end.line, + closer.inl.data.borrow().sourcepos.end.column - closer_num_chars, + ) + .into(); // Drop all the interior AST nodes into the emphasis node // and then insert the emphasis node @@ -1167,11 +1174,13 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { } opener.inl.insert_after(emph); - // Drop the delimiters and return the next closer to process - + // Drop completely "used up" delimiters, adjust sourcepos of those not, + // and return the next closest one for processing. 
if opener_num_chars == 0 { opener.inl.detach(); self.remove_delimiter(opener); + } else { + opener.inl.data.borrow_mut().sourcepos.end.column -= use_delims; } if closer_num_chars == 0 { @@ -1179,6 +1188,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c> Subject<'a, 'r, 'o, 'd, 'i, 'c> { self.remove_delimiter(closer); closer.next.get() } else { + closer.inl.data.borrow_mut().sourcepos.start.column += use_delims; Some(closer) } } diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 656491ac..52cf02f8 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -271,3 +271,41 @@ fn echaw4() { ]) ); } + +#[test] +fn echaw5() { + assert_ast_match!( + [], + // [extension.autolink], + "_#___@e.u", + (document (1:1-1:9) [ + (paragraph (1:1-1:9) [ + (emph (1:1-1:3) [ + (text (1:2-1:2) "#") + ]) + // (link (1:1-1:14) "mailto:-@_.e" [ + // (text (1:1-1:14) "-@_.e") + // ]) + (text (1:4-1:9) "__@e.u") + ]) + ]) + ); +} + +#[test] +fn echaw6() { + assert_ast_match!( + [extension.autolink], + "_#___@e.u", + (document (1:1-1:9) [ + (paragraph (1:1-1:9) [ + (emph (1:1-1:3) [ + (text (1:2-1:2) "#") + ]) + (link (1:4-1:9) "mailto:__@e.u" [ + (text (1:4-1:9) "__@e.u") + ]) + ]) + ]) + ); +} diff --git a/src/tests/sourcepos.rs b/src/tests/sourcepos.rs index 19aeb619..5c61468b 100644 --- a/src/tests/sourcepos.rs +++ b/src/tests/sourcepos.rs @@ -394,8 +394,10 @@ const SPOILERED_TEXT: TestCase = ( after"#, ); +// NOTE: I've adjusted this from its original asserted sourcepos (2:1-2:8) while +// fixing emphasis sourcepos. I am not even sure what it is, really. const ESCAPED_TAG: TestCase = ( - &[sourcepos!((2:1-2:8))], + &[sourcepos!((2:2-2:8))], r#"before ||hello| after"#, From 91ad747eb0693192400d33a460f15dbcafde8325 Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Sun, 2 Mar 2025 13:41:40 +1100 Subject: [PATCH 27/30] autolink: identify & permit `rem == 0` case. 
--- src/parser/autolink.rs | 8 ++++++-- src/tests/fuzz.rs | 19 +++++++++++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 6f629ecb..72e2e1e3 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -123,13 +123,17 @@ pub(crate) fn process_email_autolinks<'a>( // - if remaining `i` is equal to the byte count `x`, // set `e = sp.end.column` and finish. // - if remaining `i` is less than the byte count `x`, -// assert `sp.end.column - sp.start.column + 1 == x` (1), +// assert `sp.end.column - sp.start.column + 1 == x || rem == 0` (1), // set `e = sp.start.column + i - 1` and finish. // // (1) If `x` doesn't equal the range covered between the start and end column, // there's no way to determine sourcepos within the range. This is a bug if // it happens; it suggests we've matched an email autolink with some smart // punctuation in it, or worse. +// +// The one exception is if `rem == 0`. Given nothing to consume, we can +// happily restore what we popped, returning `sp.start.column - 1` for the +// end column of the original node. 
fn consume_spx(spx: &mut VecDeque<(Sourcepos, usize)>, mut rem: usize) -> usize { while let Some((sp, x)) = spx.pop_front() { if rem > x { @@ -138,7 +142,7 @@ fn consume_spx(spx: &mut VecDeque<(Sourcepos, usize)>, mut rem: usize) -> usize return sp.end.column; } else { // rem < x - assert_eq!(sp.end.column - sp.start.column + 1, x); + assert!((sp.end.column - sp.start.column + 1 == x) || rem == 0); spx.push_front(( ( sp.start.line, diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 52cf02f8..191ce8df 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -276,16 +276,12 @@ fn echaw4() { fn echaw5() { assert_ast_match!( [], - // [extension.autolink], "_#___@e.u", (document (1:1-1:9) [ (paragraph (1:1-1:9) [ (emph (1:1-1:3) [ (text (1:2-1:2) "#") ]) - // (link (1:1-1:14) "mailto:-@_.e" [ - // (text (1:1-1:14) "-@_.e") - // ]) (text (1:4-1:9) "__@e.u") ]) ]) @@ -309,3 +305,18 @@ fn echaw6() { ]) ); } + +#[test] +fn echaw7() { + assert_ast_match!( + [extension.autolink], + "Ai@i.a", + (document (1:1-1:10) [ + (paragraph (1:1-1:10) [ + (link (1:1-1:10) "mailto:Ai@i.a" [ + (text (1:1-1:10) "Ai@i.a") + ]) + ]) + ]) + ); +} From c8d935e610bf43fb700cea54d6603e42d1d0805f Mon Sep 17 00:00:00 2001 From: Talya Connor Date: Sun, 2 Mar 2025 13:41:40 +1100 Subject: [PATCH 28/30] tests: add new failure (tasklist). --- src/tests/fuzz.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index 191ce8df..b2567dbd 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -320,3 +320,21 @@ fn echaw7() { ]) ); } + +#[test] +fn echaw8() { + // fuzz/artifacts/all_options/minimized-from-57c3eaf5e03b3fd7fa971b0db6143ee3c21a7452 + assert_ast_match!( + [extension.autolink, extension.tasklist], + "- [x] 𝔛- Date: Sun, 2 Mar 2025 13:41:40 +1100 Subject: [PATCH 29/30] parser: correct tasklist spx. 
--- src/parser/autolink.rs | 56 +++-------------------------------- src/parser/mod.rs | 67 +++++++++++++++++++++++++++++++++++++----- src/tests/fuzz.rs | 9 ++++-- 3 files changed, 70 insertions(+), 62 deletions(-) diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index 72e2e1e3..294d6621 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -1,8 +1,7 @@ use crate::character_set::character_set; use crate::ctype::{isalnum, isalpha, isspace}; use crate::nodes::{AstNode, NodeLink, NodeValue, Sourcepos}; -use crate::parser::inlines::make_inline; -use std::collections::VecDeque; +use crate::parser::{inlines::make_inline, Spx}; use std::str; use typed_arena::Arena; use unicode_categories::UnicodeCategories; @@ -13,7 +12,7 @@ pub(crate) fn process_email_autolinks<'a>( contents_str: &mut String, relaxed_autolinks: bool, sourcepos: &mut Sourcepos, - mut spx: VecDeque<(Sourcepos, usize)>, + spx: &mut Spx, ) { let contents = contents_str.as_bytes(); let len = contents.len(); @@ -64,9 +63,9 @@ pub(crate) fn process_email_autolinks<'a>( }; let initial_end_col = sourcepos.end.column; - sourcepos.end.column = consume_spx(&mut spx, i); + sourcepos.end.column = spx.consume(i); - let nsp_end_col = consume_spx(&mut spx, skip); + let nsp_end_col = spx.consume(skip); contents_str.truncate(i); @@ -112,53 +111,6 @@ pub(crate) fn process_email_autolinks<'a>( } } } - -// Sourcepos end column `e` of the original node (set by writing to -// `*sourcepos`) determined by advancing through `spx` until `i` bytes of input -// are seen. -// -// For each element `(sp, x)` in `spx`: -// - if remaining `i` is greater than the byte count `x`, -// set `i -= x` and continue. -// - if remaining `i` is equal to the byte count `x`, -// set `e = sp.end.column` and finish. -// - if remaining `i` is less than the byte count `x`, -// assert `sp.end.column - sp.start.column + 1 == x || rem == 0` (1), -// set `e = sp.start.column + i - 1` and finish. 
-// -// (1) If `x` doesn't equal the range covered between the start and end column, -// there's no way to determine sourcepos within the range. This is a bug if -// it happens; it suggests we've matched an email autolink with some smart -// punctuation in it, or worse. -// -// The one exception is if `rem == 0`. Given nothing to consume, we can -// happily restore what we popped, returning `sp.start.column - 1` for the -// end column of the original node. -fn consume_spx(spx: &mut VecDeque<(Sourcepos, usize)>, mut rem: usize) -> usize { - while let Some((sp, x)) = spx.pop_front() { - if rem > x { - rem -= x; - } else if rem == x { - return sp.end.column; - } else { - // rem < x - assert!((sp.end.column - sp.start.column + 1 == x) || rem == 0); - spx.push_front(( - ( - sp.start.line, - sp.start.column + rem, - sp.end.line, - sp.end.column, - ) - .into(), - x - rem, - )); - return sp.start.column + rem - 1; - } - } - unreachable!(); -} - fn email_match<'a>( arena: &'a Arena>, contents: &[u8], diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 188ff2a8..1bb6f801 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2961,14 +2961,14 @@ where // Join adjacent text nodes together, then post-process. // Record the original list of sourcepos and bytecounts // for the post-processing step. - let mut spx = VecDeque::new(); - spx.push_back((sourcepos, root.len())); + let mut spxv = VecDeque::new(); + spxv.push_back((sourcepos, root.len())); while let Some(ns) = n.next_sibling() { match ns.data.borrow().value { NodeValue::Text(ref adj) => { root.push_str(adj); let sp = ns.data.borrow().sourcepos; - spx.push_back((sp, adj.len())); + spxv.push_back((sp, adj.len())); sourcepos.end.column = sp.end.column; ns.detach(); } @@ -2976,7 +2976,7 @@ where } } - self.postprocess_text_node(n, root, &mut sourcepos, spx); + self.postprocess_text_node(n, root, &mut sourcepos, spxv); emptied = root.len() == 0; } NodeValue::Link(..) | NodeValue::Image(..) | NodeValue::WikiLink(..) 
=> { @@ -3011,10 +3011,11 @@ where node: &'a AstNode<'a>, text: &mut String, sourcepos: &mut Sourcepos, - spx: VecDeque<(Sourcepos, usize)>, + spxv: VecDeque<(Sourcepos, usize)>, ) { + let mut spx = Spx(spxv); if self.options.extension.tasklist { - self.process_tasklist(node, text, sourcepos); + self.process_tasklist(node, text, sourcepos, &mut spx); } if self.options.extension.autolink { @@ -3024,7 +3025,7 @@ where text, self.options.parse.relaxed_autolinks, sourcepos, - spx, + &mut spx, ); } } @@ -3034,6 +3035,7 @@ where node: &'a AstNode<'a>, text: &mut String, sourcepos: &mut Sourcepos, + spx: &mut Spx, ) { let (end, symbol) = match scanners::tasklist(text.as_bytes()) { Some(p) => p, @@ -3071,6 +3073,8 @@ where // the count thereof (i.e. "end") will precisely map to characters in // the source document. sourcepos.start.column += end; + let reference = spx.consume(end) + 1; + assert_eq!(reference, sourcepos.start.column); parent.data.borrow_mut().sourcepos.start.column += end; grandparent.data.borrow_mut().value = @@ -3323,3 +3327,52 @@ pub enum ListStyleType { /// The `*` character Star = 42, } + +pub(crate) struct Spx(VecDeque<(Sourcepos, usize)>); + +impl Spx { + // Sourcepos end column `e` of a node determined by advancing through `spx` + // until `i` bytes of input are seen. + // + // For each element `(sp, x)` in `spx`: + // - if remaining `i` is greater than the byte count `x`, + // set `i -= x` and continue. + // - if remaining `i` is equal to the byte count `x`, + // set `e = sp.end.column` and finish. + // - if remaining `i` is less than the byte count `x`, + // assert `sp.end.column - sp.start.column + 1 == x || i == 0` (1), + // set `e = sp.start.column + i - 1` and finish. + // + // (1) If `x` doesn't equal the range covered between the start and end column, + // there's no way to determine sourcepos within the range. This is a bug if + // it happens; it suggests we've matched an email autolink with some smart + // punctuation in it, or worse. 
+ // + // The one exception is if `i == 0`. Given nothing to consume, we can + // happily restore what we popped, returning `sp.start.column - 1` for the + // end column of the original node. + pub(crate) fn consume(&mut self, mut rem: usize) -> usize { + while let Some((sp, x)) = self.0.pop_front() { + if rem > x { + rem -= x; + } else if rem == x { + return sp.end.column; + } else { + // rem < x + assert!((sp.end.column - sp.start.column + 1 == x) || rem == 0); + self.0.push_front(( + ( + sp.start.line, + sp.start.column + rem, + sp.end.line, + sp.end.column, + ) + .into(), + x - rem, + )); + return sp.start.column + rem - 1; + } + } + unreachable!(); + } +} diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index b2567dbd..b99db429 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -329,9 +329,12 @@ fn echaw8() { "- [x] 𝔛- Date: Mon, 3 Mar 2025 07:37:36 +1100 Subject: [PATCH 30/30] tests: add new failure ( ). --- src/tests/fuzz.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/tests/fuzz.rs b/src/tests/fuzz.rs index b99db429..3bd3f5b6 100644 --- a/src/tests/fuzz.rs +++ b/src/tests/fuzz.rs @@ -341,3 +341,24 @@ fn echaw8() { ]), ); } + +#[test] +fn echaw9() { + // fuzz/artifacts/all_options/minimized-from-8a07a44ba1f971ec39d0c14d377c78c2535c6fd5 + assert_ast_match!( + [extension.autolink, extension.tasklist], + "-\t[ ] ", + (document (1:1-1:15) [ + (list (1:1-1:15) [ + (taskitem (1:1-1:15) [ + (paragraph (1:7-1:17) [ + (text (1:7-1:13) "𝔛-<") + (link (1:14-1:17) "mailto:A@.N" [ + (text (1:14-1:17) "A@.N") + ]) + ]) + ]) + ]) + ]), + ); +}