forked from tabulapdf/tabula-java
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Conflicts: src/main/java/technology/tabula/CommandLineApp.java src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java
- Loading branch information
Showing
6 changed files
with
122 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package technology.tabula.filters; | ||
|
||
import technology.tabula.Line; | ||
import java.util.List; | ||
|
||
|
||
public interface LineFilter { | ||
List<Line> filterLines(List<Line> lines); | ||
} |
36 changes: 36 additions & 0 deletions
36
src/main/java/technology/tabula/filters/PageNumberFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package technology.tabula.filters; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.regex.Pattern; | ||
|
||
import technology.tabula.Line; | ||
import technology.tabula.TextChunk; | ||
|
||
|
||
public class PageNumberFilter implements LineFilter { | ||
// regex to match lines that look like " 1/2 " | ||
private static final Pattern PAGE_NUMBER_PATTERN = Pattern.compile("\\A\\s*\\d+\\s*/\\s*\\d+\\s*\\z"); | ||
|
||
|
||
public List<Line> filterLines(List<Line> lines) { | ||
Collections.sort(lines); | ||
// just check first and last lines, to avoid messing up anything in the middle of the page | ||
if (isLineNumber(lines.get(0))) lines.remove(0); | ||
if (isLineNumber(lines.get(lines.size() - 1))) lines.remove(lines.size() - 1); | ||
return lines; | ||
} | ||
|
||
private static boolean isLineNumber(Line line) { | ||
return PAGE_NUMBER_PATTERN.matcher(lineTextContent(line)).matches(); | ||
} | ||
|
||
private static String lineTextContent(Line line) { | ||
StringBuilder sb = new StringBuilder(); | ||
for (TextChunk text: line.getTextElements()) { | ||
sb.append(text.getText()); | ||
} | ||
return sb.toString(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package technology.tabula; | ||
|
||
import static org.junit.Assert.*; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import org.apache.pdfbox.pdmodel.font.PDType1Font; | ||
import org.junit.Test; | ||
|
||
import technology.tabula.filters.PageNumberFilter; | ||
|
||
|
||
public class TestPageNumberFilter { | ||
|
||
private Line lineWithText(String text, float y) { | ||
Line line = new Line(); | ||
TextElement tElement = new TextElement(y, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, text, 5); | ||
List<TextChunk> tList = new ArrayList<TextChunk>(); | ||
tList.add(new TextChunk(tElement)); | ||
line.setTextElements(tList); | ||
return line; | ||
} | ||
|
||
@Test | ||
public void testRemoveFirstLine() { | ||
List<Line> lines = new ArrayList<Line>(); | ||
lines.add(lineWithText("1/2", 0)); | ||
Line expectedFirstLine = lineWithText("not a page number", 10); | ||
lines.add(expectedFirstLine); | ||
lines.add(lineWithText("some text", 20)); | ||
|
||
lines = new PageNumberFilter().filterLines(lines); | ||
assertEquals(2, lines.size()); | ||
assertEquals(expectedFirstLine, lines.get(0)); | ||
} | ||
|
||
@Test | ||
public void testRemoveLastLine() { | ||
List<Line> lines = new ArrayList<Line>(); | ||
lines.add(lineWithText("wow", 0)); | ||
lines.add(lineWithText("not a page number", 10)); | ||
Line expectedLastLine = lineWithText("some text", 20); | ||
lines.add(expectedLastLine); | ||
lines.add(lineWithText("1/2", 30)); | ||
|
||
lines = new PageNumberFilter().filterLines(lines); | ||
assertEquals(3, lines.size()); | ||
assertEquals(expectedLastLine, lines.get(2)); | ||
} | ||
} |