Skip to content

Commit

Permalink
Closes #55
Browse files Browse the repository at this point in the history
  • Loading branch information
sylvainhalle committed Aug 3, 2019
1 parent bd38349 commit 666a073
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.regex.Pattern;

import ca.uqac.lif.textidote.as.AnnotatedString;
import ca.uqac.lif.textidote.as.Match;
import ca.uqac.lif.textidote.as.Position;
import ca.uqac.lif.textidote.cleaning.TextCleaner;
import ca.uqac.lif.textidote.cleaning.TextCleanerException;
Expand Down Expand Up @@ -319,13 +320,12 @@ protected AnnotatedString removeMarkup(AnnotatedString as_out, int line_pos)
as_out = as_out.replaceAll("~", " ");
// Dots
as_out = as_out.replaceAll("\\\\(dots|cdots|ldots)", "...");
// Inline equations with only digits are replaced by digits or letters
as_out = as_out.replaceAll("([^\\\\])\\$([\\d,a-zA-z]+?)\\$", "$1$2");
as_out = as_out.replaceAll("^\\$([\\d,a-zA-z]+?)\\$", "$1");
// Other inline equations are replaced by "X"
as_out = as_out.replaceAll("([^\\\\])\\$.*?[^\\\\]\\$", "$1X");
as_out = as_out.replaceAll("^\\$.*?[^\\\\]\\$", "X");
as_out = as_out.replaceAll("\\\\\\(.*?\\\\\\)", "X");
// Inline equations are replaced by "X"
as_out = replaceInlineEquations(as_out, line_pos);
/*as_out = as_out.replaceAll("([^\\\\])\\$.*?[^\\\\]\\$", "$1X");
//as_out = as_out.replaceAll("^\\$.*?[^\\\\]\\$", "X");
as_out = as_out.replaceAll("^\\$([^\\$]|\\.)*\\$", "X");
as_out = as_out.replaceAll("\\\\\\(.*?\\\\\\)", "X");*/
// Commands we can ignore
as_out = as_out.replaceAll("\\\\\\w+\\{", "");
//as_out = as_out.replaceAll("\\\\(title|textbf|textit|emph|uline|section|subsection|subsubsection|paragraph)", "");
Expand All @@ -334,6 +334,45 @@ protected AnnotatedString removeMarkup(AnnotatedString as_out, int line_pos)
return as_out;
}

/**
 * Replaces inline equations delimited by unescaped dollar signs.
 * <p>
 * An equation whose content is made only of digits, ASCII letters, periods
 * and commas (e.g. <tt>$abc$</tt> or <tt>$3.5$</tt>) is replaced by that
 * content; any other equation is replaced by the placeholder "X".
 * <p>
 * Delimiters are located with a negative look-behind instead of consuming
 * the character that precedes the opening dollar sign. This way, an
 * equation at the very beginning of the string is handled by the same
 * pattern as any other, and the closing dollar of one equation can never
 * be mistaken for the opening dollar of the next one.
 * @param as_out The string to replace from
 * @param line_pos Not used by this method; kept so that the signature
 * stays compatible with existing callers
 * @return The string with inline equations replaced
 */
protected AnnotatedString replaceInlineEquations(AnnotatedString as_out, int line_pos)
{
	// A dollar not preceded by a backslash, at least one character,
	// then another dollar not preceded by a backslash
	final String equation_regex = "(?<!\\\\)\\$.+?(?<!\\\\)\\$";
	// Content that is kept verbatim instead of being replaced by "X"
	final String simple_content_regex = "[\\dA-Za-z\\.,]+";
	Position p = Position.ZERO;
	Match m = as_out.find(equation_regex, p);
	while (m != null)
	{
		p = m.getPosition();
		String s_from = m.getMatch();
		// Strip the opening and closing dollar signs
		String s_inside = s_from.substring(1, s_from.length() - 1);
		String s_to = "X";
		if (s_inside.matches(simple_content_regex))
		{
			s_to = s_inside;
		}
		// The matched text is quoted so that it is searched for literally
		as_out = as_out.replaceAll(Pattern.quote(s_from), s_to);
		// The replacement never contains a dollar sign, so stepping one
		// character forward is enough to guarantee progress
		p = p.moveBy(1);
		m = as_out.find(equation_regex, p);
	}
	return as_out;
}

/**
* Replaces escaped accented character sequences by their proper character
* @param as_out The string to replace from
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,46 @@ public void testRemoveMarkup3() throws TextCleanerException
assertEquals(9, p.getColumn());
}

/**
 * An inline equation made only of letters is replaced by its content.
 */
@Test
public void testRemoveMathMarkup1() throws TextCleanerException
{
	LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
	Scanner input = new Scanner("Hello $abc$ world");
	AnnotatedString original = AnnotatedString.read(input);
	AnnotatedString cleaned = cleaner.clean(original);
	String actual = cleaned.toString();
	assertEquals("Hello abc world", actual);
}

/**
 * An inline equation containing a decimal number is replaced by that number.
 */
@Test
public void testRemoveMathMarkup2() throws TextCleanerException
{
	LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
	Scanner input = new Scanner("Hello $3.5$ world");
	AnnotatedString original = AnnotatedString.read(input);
	AnnotatedString cleaned = cleaner.clean(original);
	String actual = cleaned.toString();
	assertEquals("Hello 3.5 world", actual);
}

/**
 * An inline equation mixing digits, a period and a letter is replaced by
 * its content.
 */
@Test
public void testRemoveMathMarkup3() throws TextCleanerException
{
	LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
	Scanner input = new Scanner("Hello $3.5x$ world");
	AnnotatedString original = AnnotatedString.read(input);
	AnnotatedString cleaned = cleaner.clean(original);
	String actual = cleaned.toString();
	assertEquals("Hello 3.5x world", actual);
}

/**
 * An inline equation containing a LaTeX command is replaced by "X".
 */
@Test
public void testRemoveMathMarkup4() throws TextCleanerException
{
	LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
	Scanner input = new Scanner("Hello $abc\\theta$ world");
	AnnotatedString original = AnnotatedString.read(input);
	AnnotatedString cleaned = cleaner.clean(original);
	String actual = cleaned.toString();
	assertEquals("Hello X world", actual);
}

/**
 * An inline equation containing an escaped dollar sign is replaced by "X"
 * as a whole; the escaped dollar does not end the equation.
 */
@Test
public void testRemoveMathMarkup5() throws TextCleanerException
{
	LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
	Scanner input = new Scanner("Hello $ab\\$$ world");
	AnnotatedString original = AnnotatedString.read(input);
	AnnotatedString cleaned = cleaner.clean(original);
	String actual = cleaned.toString();
	assertEquals("Hello X world", actual);
}

@Test
public void testRemoveEnvironments1() throws TextCleanerException
{
Expand Down

1 comment on commit 666a073

@matze-dd
Copy link

@matze-dd matze-dd commented on 666a073 Aug 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Sylvain,

just two comments.

  1. Will translation of '$xyz$' into 'xyz' provoke unnecessary hits from LanguageTool?
  2. Your first regular expression consumes a leading character not being a backslash. Therefore, you have to reconsider the case of a starting string. Most engines provide look-behind, where one requires or excludes a preceding pattern without consuming it. For instance, '(?<!\\)\$' will match a dollar not preceded by a backslash.

Kind regards,
Matthias

Please sign in to comment.