2026-05-06 15:49:28 +08:00
|
|
|
package extraction;
|
|
|
|
|
|
2026-05-06 16:51:03 +08:00
|
|
|
import lombok.Getter;
|
2026-05-06 15:49:28 +08:00
|
|
|
import model.TextElement;
|
|
|
|
|
import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
|
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
|
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
|
|
import org.apache.pdfbox.text.TextPosition;
|
|
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
|
|
|
|
public class PositionedTextStripper extends PDFTextStripper {
|
2026-05-06 16:51:03 +08:00
|
|
|
@Getter
|
2026-05-06 15:49:28 +08:00
|
|
|
private final List<TextElement> elements = new ArrayList<>();
|
|
|
|
|
private int currentPage = 0;
|
|
|
|
|
private float currentPageWidth = 0;
|
|
|
|
|
private float currentPageHeight = 0;
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
protected void startPage(PDPage page) throws IOException {
|
|
|
|
|
currentPage++;
|
|
|
|
|
PDRectangle mediaBox = page.getMediaBox();
|
|
|
|
|
currentPageWidth = mediaBox.getWidth();
|
|
|
|
|
currentPageHeight = mediaBox.getHeight();
|
|
|
|
|
super.startPage(page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
|
|
if (textPositions == null || textPositions.isEmpty()) {
|
|
|
|
|
super.writeString(text, textPositions);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
TextPosition first = textPositions.get(0);
|
|
|
|
|
TextPosition last = textPositions.get(textPositions.size() - 1);
|
|
|
|
|
|
|
|
|
|
TextElement elem = new TextElement();
|
|
|
|
|
elem.setText(text.trim());
|
|
|
|
|
elem.setPageNum(currentPage);
|
|
|
|
|
elem.setX(first.getX());
|
|
|
|
|
elem.setY(first.getY());
|
|
|
|
|
elem.setWidth(Math.abs((last.getX() + last.getWidth()) - first.getX()));
|
|
|
|
|
elem.setHeight(Math.max(1, Math.abs(first.getHeight())));
|
|
|
|
|
elem.setFontSize(first.getFontSizeInPt());
|
|
|
|
|
elem.setPageWidth(currentPageWidth);
|
|
|
|
|
elem.setPageHeight(currentPageHeight);
|
|
|
|
|
|
|
|
|
|
if (!elem.getText().isEmpty()) {
|
|
|
|
|
elements.add(elem);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
super.writeString(text, textPositions);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|