Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inline sourcepos fixes. #542

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3337269
sourcepos: fix(?) HtmlInline.
kivikakk Feb 26, 2025
4729f60
math: fix sourcepos test, mark #[ignore].
kivikakk Feb 26, 2025
062865c
inlines: fix Code sourcepos.
kivikakk Feb 26, 2025
888e521
inlines: fix (one) LineBreak sourcepos.
kivikakk Feb 26, 2025
f8abcac
tests: test other LineBreak sourcepos.
kivikakk Feb 26, 2025
447443b
inlines: fix basic autolinks. (Tests failing other links.)
kivikakk Feb 26, 2025
68b2b9a
inlines: fix sourcepos for colon/w autolinks; add tests (email still …
kivikakk Feb 26, 2025
19bcb5e
inlines: more autolink fixes (still no email).
kivikakk Feb 26, 2025
ffd8159
autolink: fix a bunch of sourcepos!
kivikakk Feb 26, 2025
220a902
parser: fix ThematicBreak sourcepos end.
kivikakk Feb 26, 2025
b92a2eb
tests: document list/item sourcepos issues.
kivikakk Feb 26, 2025
30abc1c
inlines: fix Math sourcepos.
kivikakk Feb 27, 2025
31b07ff
inlines: mark some funs as not pub.
kivikakk Feb 27, 2025
ee42217
tests: elaborate on a bunch of AST tests.
kivikakk Feb 27, 2025
33363de
autolink: fix up email sourcepos in presence of smart punctuation.
kivikakk Feb 27, 2025
21eae30
parser: remove empty node left over by postprocessing.
kivikakk Feb 27, 2025
6849733
inlines: fix LineBreak sourcepos from backslash.
kivikakk Feb 27, 2025
4ed12c7
autolink: correct "=" case.
kivikakk Feb 27, 2025
2bb23c0
autolink: deal with consecutive replacements.
kivikakk Feb 27, 2025
1d030cf
autolink: refactor consume_spx.
kivikakk Feb 27, 2025
b86e224
sourcepos: good grief.
kivikakk Feb 27, 2025
dca38dc
autolink: no longer iffy on this; record another question.
kivikakk Feb 27, 2025
cba4264
autolink: answer this question.
kivikakk Feb 27, 2025
87aa79c
tests: new failing test; entities are cursed.
kivikakk Feb 27, 2025
82cb6e6
autolink: consume_spx to find new end.column.
kivikakk Feb 27, 2025
642ba7b
inlines: first fixes to emphasis sourcepos.
kivikakk Feb 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@
formatter = pkgs.alejandra;

devShells.default = pkgs.mkShell {
name = "comrak";

inputsFrom = builtins.attrValues self.checks.${system};

nativeBuildInputs = [
Expand Down
335 changes: 211 additions & 124 deletions src/parser/autolink.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
use crate::character_set::character_set;
use crate::ctype::{isalnum, isalpha, isspace};
use crate::nodes::{AstNode, NodeLink, NodeValue};
use crate::nodes::{AstNode, NodeLink, NodeValue, Sourcepos};
use crate::parser::inlines::make_inline;
use std::collections::VecDeque;
use std::str;
use typed_arena::Arena;
use unicode_categories::UnicodeCategories;

// TODO: this can probably be cleaned up a lot. It used to handle all three of
// {url,www,email}_match, but now just the last of those.
pub(crate) fn process_autolinks<'a>(
pub(crate) fn process_email_autolinks<'a>(
arena: &'a Arena<AstNode<'a>>,
node: &'a AstNode<'a>,
contents_str: &mut String,
relaxed_autolinks: bool,
sourcepos: &mut Sourcepos,
mut spx: VecDeque<(Sourcepos, usize)>,
) {
let contents = contents_str.as_bytes();
let len = contents.len();
Expand Down Expand Up @@ -53,21 +54,221 @@ pub(crate) fn process_autolinks<'a>(
if let Some((post, reverse, skip)) = post_org {
i -= reverse;
node.insert_after(post);
if i + skip < len {

let remain = if i + skip < len {
let remain = str::from_utf8(&contents[i + skip..]).unwrap();
assert!(!remain.is_empty());
post.insert_after(make_inline(
Some(remain.to_string())
} else {
None
};
let initial_end_col = sourcepos.end.column;

sourcepos.end.column = consume_spx(&mut spx, i);

let nsp_end_col = consume_spx(&mut spx, skip);

contents_str.truncate(i);

let nsp: Sourcepos = (
sourcepos.end.line,
sourcepos.end.column + 1,
sourcepos.end.line,
nsp_end_col,
)
.into();
post.data.borrow_mut().sourcepos = nsp;
// Inner text gets same sourcepos as link, since there's nothing but
// the text.
post.first_child().unwrap().data.borrow_mut().sourcepos = nsp;

if let Some(remain) = remain {
let mut asp: Sourcepos = (
sourcepos.end.line,
nsp.end.column + 1,
sourcepos.end.line,
initial_end_col,
)
.into();
let after = make_inline(arena, NodeValue::Text(remain.to_string()), asp);
post.insert_after(after);

let after_ast = &mut after.data.borrow_mut();
process_email_autolinks(
arena,
NodeValue::Text(remain.to_string()),
(0, 1, 0, 1).into(),
));
after,
match after_ast.value {
NodeValue::Text(ref mut t) => t,
_ => unreachable!(),
},
relaxed_autolinks,
&mut asp,
spx,
);
after_ast.sourcepos = asp;
}
contents_str.truncate(i);

return;
}
}
}

// Sourcepos end column `e` of the original node (set by writing to
// `*sourcepos`) determined by advancing through `spx` until `i` bytes of input
// are seen.
//
// For each element `(sp, x)` in `spx`:
// - if remaining `i` is greater than the byte count `x`,
// set `i -= x` and continue.
// - if remaining `i` is equal to the byte count `x`,
// set `e = sp.end.column` and finish.
// - if remaining `i` is less than the byte count `x`,
// assert `sp.end.column - sp.start.column + 1 == x` (1),
// set `e = sp.start.column + i - 1` and finish.
//
// (1) If `x` doesn't equal the range covered between the start and end column,
// there's no way to determine sourcepos within the range. This is a bug if
// it happens; it suggests we've matched an email autolink with some smart
// punctuation in it, or worse.
fn consume_spx(spx: &mut VecDeque<(Sourcepos, usize)>, mut rem: usize) -> usize {
while let Some((sp, x)) = spx.pop_front() {
if rem > x {
rem -= x;
} else if rem == x {
return sp.end.column;
} else {
// rem < x
assert_eq!(sp.end.column - sp.start.column + 1, x);
spx.push_front((
(
sp.start.line,
sp.start.column + rem,
sp.end.line,
sp.end.column,
)
.into(),
x - rem,
));
return sp.start.column + rem - 1;
}
}
unreachable!();
}

// Tries to recognise an email-style autolink around byte offset `i` of
// `contents` (the caller passes the position it wants tested; the scan treats
// it as the separator between the local part and the domain).
//
// The scan runs backwards from `i` over alphanumerics and `.+-_`, also
// stepping over a `:` when `validate_protocol` accepts a `mailto` or `xmpp`
// scheme in front of it. It then runs forwards from `i` over the domain, and
// the candidate end is passed through `autolink_delim`.
//
// On success returns `(link, rewind, rewind + link_end)`: a freshly allocated
// `Link` node with one `Text` child (both carrying the placeholder sourcepos
// `(0, 1, 0, 1)`), the number of bytes before `i` that belong to the match,
// and the total byte length of the match. Returns `None` when nothing matches.
fn email_match<'a>(
    arena: &'a Arena<AstNode<'a>>,
    contents: &[u8],
    i: usize,
    relaxed_autolinks: bool,
) -> Option<(&'a AstNode<'a>, usize, usize)> {
    const EMAIL_OK_SET: [bool; 256] = character_set!(b".+-_");

    let mut auto_mailto = true;
    let mut is_xmpp = false;

    // Backward scan: count how many bytes before `i` belong to the address.
    let mut rewind = 0;
    while rewind < i {
        let at = i - rewind - 1;
        let c = contents[at];

        let accepted = if isalnum(c) || EMAIL_OK_SET[c as usize] {
            true
        } else if c != b':' {
            false
        } else if validate_protocol("mailto", contents, at) {
            // Explicit scheme present; don't prepend our own.
            auto_mailto = false;
            true
        } else if validate_protocol("xmpp", contents, at) {
            is_xmpp = true;
            auto_mailto = false;
            true
        } else {
            false
        };

        if !accepted {
            break;
        }
        rewind += 1;
    }

    if rewind == 0 {
        return None;
    }

    // Forward scan over the part after `i`.
    let remaining = contents.len() - i;
    let mut link_end = 1;
    let mut dots = 0;

    while link_end < remaining {
        let c = contents[i + link_end];

        match c {
            _ if isalnum(c) => {}
            // A second `@` disqualifies the whole candidate.
            b'@' => return None,
            // Only a dot that is followed by an alphanumeric is counted.
            b'.' if link_end + 1 < remaining && isalnum(contents[i + link_end + 1]) => {
                dots += 1;
            }
            // xmpp allows a `/` in the url
            b'/' if is_xmpp => {}
            b'-' | b'_' => {}
            _ => break,
        }

        link_end += 1;
    }

    if link_end < 2 || dots == 0 {
        return None;
    }

    let last = contents[i + link_end - 1];
    if !isalpha(last) && last != b'.' {
        return None;
    }

    link_end = autolink_delim(&contents[i..], link_end, relaxed_autolinks);
    if link_end == 0 {
        return None;
    }

    let text = str::from_utf8(&contents[i - rewind..i + link_end]).unwrap();
    let url = if auto_mailto {
        format!("mailto:{}", text)
    } else {
        text.to_string()
    };

    let link = make_inline(
        arena,
        NodeValue::Link(NodeLink {
            url,
            title: String::new(),
        }),
        (0, 1, 0, 1).into(),
    );
    let label = make_inline(
        arena,
        NodeValue::Text(text.to_string()),
        (0, 1, 0, 1).into(),
    );
    link.append(label);

    Some((link, rewind, rewind + link_end))
}

// Reports whether the run of alphabetic bytes that ends immediately before
// `cursor` in `contents` is exactly `protocol`.
//
// The whole alphabetic run is compared, so a longer word that merely ends with
// `protocol` does not match.
fn validate_protocol(protocol: &str, contents: &[u8], cursor: usize) -> bool {
    // Length of the alphabetic run directly preceding `cursor`.
    let rewind = contents[..cursor]
        .iter()
        .rev()
        .take_while(|&&byte| isalpha(byte))
        .count();

    if contents.len() - cursor + rewind < protocol.len() {
        return false;
    }

    let scheme = &contents[cursor - rewind..cursor];
    scheme == protocol.as_bytes()
}

pub fn www_match<'a>(
arena: &'a Arena<AstNode<'a>>,
contents: &[u8],
Expand Down Expand Up @@ -292,117 +493,3 @@ pub fn url_match<'a>(
));
Some((inl, rewind, rewind + link_end))
}

// Tries to recognise an email-style autolink around byte offset `i` of
// `contents` (the caller passes the position it wants tested; the scan treats
// it as the separator between the local part and the domain).
//
// The scan runs backwards from `i` over alphanumerics and `.+-_`, also
// stepping over a `:` when `validate_protocol` accepts a `mailto` or `xmpp`
// scheme in front of it. It then runs forwards from `i` over the domain, and
// the candidate end is passed through `autolink_delim`.
//
// On success returns `(link, rewind, rewind + link_end)`: a freshly allocated
// `Link` node with one `Text` child (both carrying the placeholder sourcepos
// `(0, 1, 0, 1)`), the number of bytes before `i` that belong to the match,
// and the total byte length of the match. Returns `None` when nothing matches.
fn email_match<'a>(
    arena: &'a Arena<AstNode<'a>>,
    contents: &[u8],
    i: usize,
    relaxed_autolinks: bool,
) -> Option<(&'a AstNode<'a>, usize, usize)> {
    const EMAIL_OK_SET: [bool; 256] = character_set!(b".+-_");

    let mut auto_mailto = true;
    let mut is_xmpp = false;

    // Backward scan: count how many bytes before `i` belong to the address.
    let mut rewind = 0;
    while rewind < i {
        let at = i - rewind - 1;
        let c = contents[at];

        let accepted = if isalnum(c) || EMAIL_OK_SET[c as usize] {
            true
        } else if c != b':' {
            false
        } else if validate_protocol("mailto", contents, at) {
            // Explicit scheme present; don't prepend our own.
            auto_mailto = false;
            true
        } else if validate_protocol("xmpp", contents, at) {
            is_xmpp = true;
            auto_mailto = false;
            true
        } else {
            false
        };

        if !accepted {
            break;
        }
        rewind += 1;
    }

    if rewind == 0 {
        return None;
    }

    // Forward scan over the part after `i`.
    let remaining = contents.len() - i;
    let mut link_end = 1;
    let mut dots = 0;

    while link_end < remaining {
        let c = contents[i + link_end];

        match c {
            _ if isalnum(c) => {}
            // A second `@` disqualifies the whole candidate.
            b'@' => return None,
            // Only a dot that is followed by an alphanumeric is counted.
            b'.' if link_end + 1 < remaining && isalnum(contents[i + link_end + 1]) => {
                dots += 1;
            }
            // xmpp allows a `/` in the url
            b'/' if is_xmpp => {}
            b'-' | b'_' => {}
            _ => break,
        }

        link_end += 1;
    }

    if link_end < 2 || dots == 0 {
        return None;
    }

    let last = contents[i + link_end - 1];
    if !isalpha(last) && last != b'.' {
        return None;
    }

    link_end = autolink_delim(&contents[i..], link_end, relaxed_autolinks);
    if link_end == 0 {
        return None;
    }

    let text = str::from_utf8(&contents[i - rewind..i + link_end]).unwrap();
    let url = if auto_mailto {
        format!("mailto:{}", text)
    } else {
        text.to_string()
    };

    let link = make_inline(
        arena,
        NodeValue::Link(NodeLink {
            url,
            title: String::new(),
        }),
        (0, 1, 0, 1).into(),
    );
    let label = make_inline(
        arena,
        NodeValue::Text(text.to_string()),
        (0, 1, 0, 1).into(),
    );
    link.append(label);

    Some((link, rewind, rewind + link_end))
}

// Reports whether the run of alphabetic bytes that ends immediately before
// `cursor` in `contents` is exactly `protocol`.
//
// The whole alphabetic run is compared, so a longer word that merely ends with
// `protocol` does not match.
fn validate_protocol(protocol: &str, contents: &[u8], cursor: usize) -> bool {
    // Length of the alphabetic run directly preceding `cursor`.
    let rewind = contents[..cursor]
        .iter()
        .rev()
        .take_while(|&&byte| isalpha(byte))
        .count();

    if contents.len() - cursor + rewind < protocol.len() {
        return false;
    }

    let scheme = &contents[cursor - rewind..cursor];
    scheme == protocol.as_bytes()
}
Loading