Closes #55

sylvainhalle · Aug 3, 2019 · 666a073 · 666a073 · matze-dd · Aug 6, 2019
1 parent bd38349
commit 666a073
Show file tree

Hide file tree

Showing 2 changed files with 86 additions and 7 deletions.
diff --git a/Source/Core/src/ca/uqac/lif/textidote/cleaning/latex/LatexCleaner.java b/Source/Core/src/ca/uqac/lif/textidote/cleaning/latex/LatexCleaner.java
@@ -26,6 +26,7 @@
 import java.util.regex.Pattern;
 
 import ca.uqac.lif.textidote.as.AnnotatedString;
+import ca.uqac.lif.textidote.as.Match;
 import ca.uqac.lif.textidote.as.Position;
 import ca.uqac.lif.textidote.cleaning.TextCleaner;
 import ca.uqac.lif.textidote.cleaning.TextCleanerException;
@@ -319,13 +320,12 @@ protected AnnotatedString removeMarkup(AnnotatedString as_out, int line_pos)
 		as_out = as_out.replaceAll("~", " ");
 		// Dots
 		as_out = as_out.replaceAll("\\\\(dots|cdots|ldots)", "...");
-		// Inline equations with only digits are replaced by digits or letters
-		as_out = as_out.replaceAll("([^\\\\])\\$([\\d,a-zA-z]+?)\\$", "$1$2");
-		as_out = as_out.replaceAll("^\\$([\\d,a-zA-z]+?)\\$", "$1");
-		// Other inline equations are replaced by "X"
-		as_out = as_out.replaceAll("([^\\\\])\\$.*?[^\\\\]\\$", "$1X");
-		as_out = as_out.replaceAll("^\\$.*?[^\\\\]\\$", "X");
-		as_out = as_out.replaceAll("\\\\\\(.*?\\\\\\)", "X");
+		// Inline equations are replaced by "X"
+		as_out = replaceInlineEquations(as_out, line_pos);
+		/*as_out = as_out.replaceAll("([^\\\\])\\$.*?[^\\\\]\\$", "$1X");
+		//as_out = as_out.replaceAll("^\\$.*?[^\\\\]\\$", "X");
+		as_out = as_out.replaceAll("^\\$([^\\$]|\\.)*\\$", "X");
+		as_out = as_out.replaceAll("\\\\\\(.*?\\\\\\)", "X");*/
 		// Commands we can ignore
 		as_out = as_out.replaceAll("\\\\\\w+\\{", "");
 		//as_out = as_out.replaceAll("\\\\(title|textbf|textit|emph|uline|section|subsection|subsubsection|paragraph)", "");
@@ -334,6 +334,45 @@ protected AnnotatedString removeMarkup(AnnotatedString as_out, int line_pos)
 		return as_out;
 	}
 
+	/**
+	 * Replaces inline equations delimited by dollar signs. An equation
+	 * whose content is made only of digits, ASCII letters, periods and
+	 * commas is replaced by that content; any other equation is replaced
+	 * by the placeholder "X". A dollar sign preceded by a backslash is
+	 * not treated as a delimiter.
+	 * @param as_out The string to replace from
+	 * @param line_pos The current line position (not used by this method)
+	 * @return The string with inline equations replaced
+	 */
+	protected AnnotatedString replaceInlineEquations(AnnotatedString as_out, int line_pos)
+	{
+		Position p = Position.ZERO;
+		// Equations whose opening dollar sign follows a character other
+		// than a backslash
+		Match m = as_out.find("[^\\\\]\\$.*?[^\\\\]\\$", p);
+		while (m != null)
+		{
+			p = m.getPosition();
+			// s_from has the form <preceding char>$<content>$
+			String s_from = m.getMatch();
+			String s_inside = s_from.substring(2, s_from.length() - 1);
+			// The character preceding the equation is kept as is
+			String s_to = s_from.substring(0, 1) + simplifyEquationContent(s_inside);
+			// NOTE(review): s_to may start with '$' if the preceding character
+			// is itself a dollar sign; confirm how replaceAll treats that
+			as_out = as_out.replaceAll(Pattern.quote(s_from), s_to);
+			p = p.moveBy(1); // To ensure progress
+			m = as_out.find("[^\\\\]\\$.*?[^\\\\]\\$", p);
+		}
+		// Do it one last time for equations at the beginning of a line
+		// NOTE(review): the search resumes from p, not from Position.ZERO;
+		// confirm this is the intended starting point
+		m = as_out.find("^\\$.*?[^\\\\]\\$", p);
+		if (m != null)
+		{
+			// s_from has the form $<content>$
+			String s_from = m.getMatch();
+			String s_inside = s_from.substring(1, s_from.length() - 1);
+			as_out = as_out.replaceAll(Pattern.quote(s_from), simplifyEquationContent(s_inside));
+		}
+		return as_out;
+	}
+
+	/**
+	 * Determines the text that stands for the content of an inline equation
+	 * @param s_inside The text found between the two dollar signs
+	 * @return The content itself if it is made only of digits, ASCII
+	 * letters, periods and commas; the placeholder "X" otherwise
+	 */
+	private static String simplifyEquationContent(String s_inside)
+	{
+		if (s_inside.matches("[\\dA-Za-z\\.,]+"))
+		{
+			return s_inside;
+		}
+		return "X";
+	}
+
 	/**
 	 * Replaces escaped accented character sequences by their proper character
 	 * @param as_out The string to replace from

diff --git a/Source/CoreTest/src/ca/uqac/lif/textidote/cleaning/CleanerTest.java b/Source/CoreTest/src/ca/uqac/lif/textidote/cleaning/CleanerTest.java
@@ -64,6 +64,46 @@ public void testRemoveMarkup3() throws TextCleanerException
 		assertEquals(9, p.getColumn());
 	}
 
+	/**
+	 * Checks that cleaning "Hello $abc$ world" yields "Hello abc world"
+	 */
+	@Test
+	public void testRemoveMathMarkup1() throws TextCleanerException
+	{
+		LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
+		Scanner input = new Scanner("Hello $abc$ world");
+		AnnotatedString cleaned = cleaner.clean(AnnotatedString.read(input));
+		String result = cleaned.toString();
+		assertEquals("Hello abc world", result);
+	}
+
+	/**
+	 * Checks that cleaning "Hello $3.5$ world" yields "Hello 3.5 world"
+	 */
+	@Test
+	public void testRemoveMathMarkup2() throws TextCleanerException
+	{
+		LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
+		Scanner input = new Scanner("Hello $3.5$ world");
+		AnnotatedString cleaned = cleaner.clean(AnnotatedString.read(input));
+		String result = cleaned.toString();
+		assertEquals("Hello 3.5 world", result);
+	}
+
+	/**
+	 * Checks that cleaning "Hello $3.5x$ world" yields "Hello 3.5x world"
+	 */
+	@Test
+	public void testRemoveMathMarkup3() throws TextCleanerException
+	{
+		LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
+		Scanner input = new Scanner("Hello $3.5x$ world");
+		AnnotatedString cleaned = cleaner.clean(AnnotatedString.read(input));
+		String result = cleaned.toString();
+		assertEquals("Hello 3.5x world", result);
+	}
+
+	/**
+	 * Checks that cleaning "Hello $abc\theta$ world" yields "Hello X world"
+	 */
+	@Test
+	public void testRemoveMathMarkup4() throws TextCleanerException
+	{
+		LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
+		Scanner input = new Scanner("Hello $abc\\theta$ world");
+		AnnotatedString cleaned = cleaner.clean(AnnotatedString.read(input));
+		String result = cleaned.toString();
+		assertEquals("Hello X world", result);
+	}
+
+	/**
+	 * Checks that cleaning "Hello $ab\$$ world", where the equation
+	 * contains an escaped dollar sign, yields "Hello X world"
+	 */
+	@Test
+	public void testRemoveMathMarkup5() throws TextCleanerException
+	{
+		LatexCleaner cleaner = new LatexCleaner().setIgnoreBeforeDocument(false);
+		Scanner input = new Scanner("Hello $ab\\$$ world");
+		AnnotatedString cleaned = cleaner.clean(AnnotatedString.read(input));
+		String result = cleaned.toString();
+		assertEquals("Hello X world", result);
+	}
+
 	@Test
 	public void testRemoveEnvironments1() throws TextCleanerException
 	{