wms/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java

60 lines
1.9 KiB
Java
Raw Normal View History

package extraction;
import model.TextElement;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * A {@link PDFTextStripper} that, besides performing the regular text extraction, records
 * every non-blank string passed to {@link #writeString(String, List)} as a
 * {@link TextElement} carrying its page number, position, size, font size and the
 * dimensions of the page it was found on.
 *
 * <p>Captured elements accumulate for the lifetime of this instance; nothing in this class
 * clears them, so reusing one instance for several extractions yields one combined list.
 */
public class PositionedTextStripper extends PDFTextStripper {

    /** Elements captured so far, in the order {@code writeString} received them. */
    private final List<TextElement> elements = new ArrayList<>();

    /** Media-box width of the page most recently passed to {@link #startPage(PDPage)}. */
    private float currentPageWidth = 0;

    /** Media-box height of the page most recently passed to {@link #startPage(PDPage)}. */
    private float currentPageHeight = 0;

    /**
     * Remembers the dimensions of the page about to be processed, then delegates to the
     * superclass.
     *
     * @param page the page whose text is about to be extracted
     * @throws IOException if the superclass implementation fails
     */
    @Override
    protected void startPage(PDPage page) throws IOException {
        // NOTE(review): PDFBox appears to compute TextPosition coordinates relative to the
        // page's crop box; when a document's crop box differs from its media box these
        // dimensions may not match the x/y values stored below - confirm whether
        // page.getCropBox() is the better source.
        PDRectangle mediaBox = page.getMediaBox();
        currentPageWidth = mediaBox.getWidth();
        currentPageHeight = mediaBox.getHeight();
        super.startPage(page);
    }

    /**
     * Records the string as a {@link TextElement} (when it has positions and is not blank
     * after trimming) and then lets the superclass write it to the output as usual.
     *
     * @param text the text to write
     * @param textPositions the glyph positions backing {@code text}; may be {@code null} or
     *     empty, in which case nothing is recorded
     * @throws IOException if the superclass implementation fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        if (textPositions != null && !textPositions.isEmpty()) {
            String trimmed = text.trim();
            if (!trimmed.isEmpty()) {
                elements.add(toElement(trimmed, textPositions));
            }
        }
        super.writeString(text, textPositions);
    }

    /**
     * Returns the elements captured so far.
     *
     * @return the live internal list (not a copy); later extraction appends to it
     */
    public List<TextElement> getElements() {
        return elements;
    }

    /** Builds the element for one written string from its first and last glyph positions. */
    private TextElement toElement(String trimmedText, List<TextPosition> textPositions) {
        TextPosition first = textPositions.get(0);
        TextPosition last = textPositions.get(textPositions.size() - 1);

        TextElement elem = new TextElement();
        elem.setText(trimmedText);
        // Ask the stripper for the page number instead of counting startPage() calls:
        // startPage() is only invoked for pages that are actually processed, so a private
        // counter drifts from the real page number when pages are skipped (start/end page
        // range, pages without content) or when the instance is reused.
        elem.setPageNum(getCurrentPageNo());
        elem.setX(first.getX());
        elem.setY(first.getY());
        // Horizontal extent from the start of the first glyph to the end of the last one;
        // abs() keeps the width non-negative whatever order the positions arrive in.
        elem.setWidth(Math.abs((last.getX() + last.getWidth()) - first.getX()));
        // Never report a height below 1, even if the first glyph reports a smaller one.
        elem.setHeight(Math.max(1, Math.abs(first.getHeight())));
        elem.setFontSize(first.getFontSizeInPt());
        elem.setPageWidth(currentPageWidth);
        elem.setPageHeight(currentPageHeight);
        return elem;
    }
}