Skip to content

Commit

Permalink
Merge branch 'page-number-filter'
Browse files Browse the repository at this point in the history
Conflicts:
	src/main/java/technology/tabula/CommandLineApp.java
	src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java
  • Loading branch information
yourpalal committed Sep 16, 2016
2 parents d615cb5 + 965f8ba commit 4899ac8
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Tabula helps you extract tables from PDFs
-p,--pages <PAGES> Comma separated list of ranges, or all.
Examples: --pages 1-3,5-7, --pages 3 or
--pages all. Default is --pages 1
-pn,--rm-page-numbers Attempt to remove page numbers
-r,--spreadsheet Force PDF to be extracted using
spreadsheet-style extraction (if there are
ruling lines separating each cell, as in a PDF
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/technology/tabula/CommandLineApp.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import technology.tabula.detectors.SpreadsheetDetectionAlgorithm;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import technology.tabula.filters.PageNumberFilter;
import technology.tabula.writers.CSVWriter;
import technology.tabula.writers.JSONWriter;
import technology.tabula.writers.TSVWriter;
Expand Down Expand Up @@ -246,6 +247,7 @@ private static TableExtractor createExtractor(CommandLine line) throws ParseExce
extractor.setMethod(CommandLineApp.whichExtractionMethod(line));
extractor.setUseLineReturns(line.hasOption('u'));
extractor.setUseStraightEdges(line.hasOption("detect-horizontal-alignment"));
extractor.setRemovePageNumbers(line.hasOption("rm-page-numbers"));

if (line.hasOption('c')) {
extractor.setVerticalRulingPositions(parseFloatList(line.getOptionValue('c')));
Expand Down Expand Up @@ -283,6 +285,7 @@ public static Options buildOptions() {
o.addOption("r", "spreadsheet", false, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
o.addOption("n", "no-spreadsheet", false, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
o.addOption("i", "silent", false, "Suppress all stderr output.");
o.addOption("pn", "rm-page-numbers", false, "Attempt to remove page numbers");
o.addOption("u", "use-line-returns", false, "Use embedded line returns in cells. (Only in spreadsheet mode.)");
o.addOption("ha", "detect-horizontal-alignment", false, "Detect horizontal alignment of text to improve column detection.");
o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
Expand Down Expand Up @@ -329,6 +332,8 @@ private static class TableExtractor {
private boolean guess = false;
private boolean useLineReturns = false;
private boolean useStraightEdges = false;
private boolean removePagenumbers = false;

private List<Float> verticalRulingPositions = null;
private ExtractionMethod method = ExtractionMethod.BASIC;

Expand All @@ -351,6 +356,10 @@ public void setUseStraightEdges(boolean useStraightEdges) {
this.useStraightEdges = useStraightEdges;
}

/**
 * Enables or disables page-number removal for this extractor.
 *
 * When enabled, the basic extraction path installs a PageNumberFilter on
 * the BasicExtractionAlgorithm it creates. The command line sets this from
 * the "rm-page-numbers" option.
 * NOTE(review): only the basic extraction path is visibly affected here;
 * confirm whether spreadsheet extraction should honor this flag too.
 *
 * @param removePagenumbers true to attempt to strip page numbers
 */
public void setRemovePageNumbers(boolean removePagenumbers) {
this.removePagenumbers = removePagenumbers;
}

/**
 * Sets the extraction method used by this extractor.
 * The field defaults to ExtractionMethod.BASIC when this is never called.
 *
 * @param method the extraction method to use
 */
public void setMethod(ExtractionMethod method) {
this.method = method;
}
Expand All @@ -374,6 +383,10 @@ public List<Table> extractTables(Page page) {

public List<Table> extractTablesBasic(Page page) {
BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
if (removePagenumbers) {
basicExtractor.setLineFilter(new PageNumberFilter());
}

if (guess) {
// guess the page areas to extract using a detection algorithm
// currently we only have a detector that uses spreadsheets to find table areas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import technology.tabula.Table;
import technology.tabula.TextChunk;
import technology.tabula.TextElement;
import technology.tabula.filters.LineFilter;


public class BasicExtractionAlgorithm implements ExtractionAlgorithm {

Expand All @@ -23,13 +25,19 @@ public class BasicExtractionAlgorithm implements ExtractionAlgorithm {
// column positions
private List<Float> columnHintPositions = null;

private LineFilter lineFilter = null;

/**
 * Creates an extraction algorithm with no vertical rulings and no line
 * filter configured.
 */
public BasicExtractionAlgorithm() {
}

/**
 * Creates an extraction algorithm that uses the given vertical rulings.
 * The list is stored by reference, not copied.
 *
 * @param verticalRulings vertical rulings to use during extraction
 */
public BasicExtractionAlgorithm(List<Ruling> verticalRulings) {
this.verticalRulings = verticalRulings;
}

/**
 * Installs a filter that extract(Page) applies to the lines produced by
 * TextChunk.groupByLines before columns are determined.
 *
 * @param filter the filter to apply, or null (the default) to leave the
 *               lines unfiltered
 */
public void setLineFilter(LineFilter filter) {
this.lineFilter = filter;
}

public List<Table> extract(Page page, List<Float> verticalRulingPositions) {
this.verticalRulings = Ruling.verticalRulingsAt(verticalRulingPositions, page);
return this.extract(page);
Expand All @@ -49,6 +57,10 @@ public List<Table> extract(Page page) {
}

List<Line> lines = TextChunk.groupByLines(extractTextChunks(page));
if (lineFilter != null) {
lines = lineFilter.filterLines(lines);
}

List<Float> columns = null;

if (this.verticalRulings != null) {
Expand Down Expand Up @@ -116,7 +128,6 @@ private List<TextChunk> extractTextChunks(Page page) {
}
}


public static List<Rectangle> columnRegions(List<Line> lines) {
List<Rectangle> regions = new ArrayList<Rectangle>();
for (TextChunk tc: lines.get(0).getTextElements()) {
Expand Down Expand Up @@ -176,5 +187,4 @@ public static List<java.lang.Float> columnPositions(List<Line> lines) {

return rv;
}

}
9 changes: 9 additions & 0 deletions src/main/java/technology/tabula/filters/LineFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package technology.tabula.filters;

import technology.tabula.Line;
import java.util.List;


/**
 * A transformation over the text lines of a page, used to drop or adjust
 * lines before further processing.
 */
public interface LineFilter {
/**
 * Filters the given lines.
 *
 * @param lines the lines to filter
 * @return the filtered lines; callers should use the returned list rather
 *         than assume the argument was left untouched
 */
List<Line> filterLines(List<Line> lines);
}
36 changes: 36 additions & 0 deletions src/main/java/technology/tabula/filters/PageNumberFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package technology.tabula.filters;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import technology.tabula.Line;
import technology.tabula.TextChunk;


public class PageNumberFilter implements LineFilter {
// regex to match lines that look like " 1/2 "
private static final Pattern PAGE_NUMBER_PATTERN = Pattern.compile("\\A\\s*\\d+\\s*/\\s*\\d+\\s*\\z");


public List<Line> filterLines(List<Line> lines) {
Collections.sort(lines);
// just check first and last lines, to avoid messing up anything in the middle of the page
if (isLineNumber(lines.get(0))) lines.remove(0);
if (isLineNumber(lines.get(lines.size() - 1))) lines.remove(lines.size() - 1);
return lines;
}

private static boolean isLineNumber(Line line) {
return PAGE_NUMBER_PATTERN.matcher(lineTextContent(line)).matches();
}

private static String lineTextContent(Line line) {
StringBuilder sb = new StringBuilder();
for (TextChunk text: line.getTextElements()) {
sb.append(text.getText());
}
return sb.toString();
}
}
51 changes: 51 additions & 0 deletions src/test/java/technology/tabula/TestPageNumberFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package technology.tabula;

import static org.junit.Assert.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.junit.Test;

import technology.tabula.filters.PageNumberFilter;


/**
 * Tests that {@link PageNumberFilter} drops a page-number line ("1/2") when it
 * is the first or the last line, and leaves the remaining lines in place.
 */
public class TestPageNumberFilter {

    /**
     * Builds a Line holding a single TextChunk with the given text.
     * The y value is passed as the first TextElement constructor argument
     * (presumably the vertical position used for ordering; confirm against
     * TextElement).
     */
    private Line lineWithText(String text, float y) {
        Line result = new Line();

        List<TextChunk> chunks = new ArrayList<TextChunk>();
        chunks.add(new TextChunk(
                new TextElement(y, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, text, 5)));

        result.setTextElements(chunks);
        return result;
    }

    /** A page number on the first line is removed; the next line becomes the first. */
    @Test
    public void testRemoveFirstLine() {
        Line pageNumber = lineWithText("1/2", 0);
        Line expectedFirstLine = lineWithText("not a page number", 10);
        Line trailing = lineWithText("some text", 20);

        List<Line> input = new ArrayList<Line>();
        input.add(pageNumber);
        input.add(expectedFirstLine);
        input.add(trailing);

        List<Line> filtered = new PageNumberFilter().filterLines(input);

        assertEquals(2, filtered.size());
        assertEquals(expectedFirstLine, filtered.get(0));
    }

    /** A page number on the last line is removed; the previous line becomes the last. */
    @Test
    public void testRemoveLastLine() {
        Line first = lineWithText("wow", 0);
        Line second = lineWithText("not a page number", 10);
        Line expectedLastLine = lineWithText("some text", 20);
        Line pageNumber = lineWithText("1/2", 30);

        List<Line> input = new ArrayList<Line>();
        input.add(first);
        input.add(second);
        input.add(expectedLastLine);
        input.add(pageNumber);

        List<Line> filtered = new PageNumberFilter().filterLines(input);

        assertEquals(3, filtered.size());
        assertEquals(expectedLastLine, filtered.get(2));
    }
}

0 comments on commit 4899ac8

Please sign in to comment.