2026-05-06 15:49:28 +08:00
|
|
|
package extraction;
|
|
|
|
|
|
|
|
|
|
import model.TextElement;
|
|
|
|
|
import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
|
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
|
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
|
|
import org.apache.pdfbox.text.TextPosition;
|
2026-05-11 19:03:24 +08:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
|
import org.slf4j.LoggerFactory;
|
2026-05-06 15:49:28 +08:00
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
|
|
|
|
public class PositionedTextStripper extends PDFTextStripper {
|
2026-05-11 19:03:24 +08:00
|
|
|
|
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(PositionedTextStripper.class);
|
|
|
|
|
|
2026-05-06 15:49:28 +08:00
|
|
|
private final List<TextElement> elements = new ArrayList<>();
|
2026-05-11 19:03:24 +08:00
|
|
|
private int seqCounter = 0;
|
2026-05-06 15:49:28 +08:00
|
|
|
private int currentPage = 0;
|
|
|
|
|
private float currentPageWidth = 0;
|
|
|
|
|
private float currentPageHeight = 0;
|
|
|
|
|
|
2026-05-11 19:03:24 +08:00
|
|
|
public PositionedTextStripper() throws IOException {
|
|
|
|
|
super();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// @Override
|
|
|
|
|
// protected void processTextPosition(TextPosition text) {
|
|
|
|
|
// log.info("processTextPosition: {},unicode={},codes={},font={},embedded={},damaged={}"
|
|
|
|
|
// , text, text.getUnicode(), text.getCharacterCodes(), text.getFont().getName(), text.getFont().isEmbedded(), text.getFont().isDamaged());
|
|
|
|
|
// }
|
|
|
|
|
|
2026-05-06 15:49:28 +08:00
|
|
|
@Override
|
|
|
|
|
protected void startPage(PDPage page) throws IOException {
|
|
|
|
|
currentPage++;
|
|
|
|
|
PDRectangle mediaBox = page.getMediaBox();
|
|
|
|
|
currentPageWidth = mediaBox.getWidth();
|
|
|
|
|
currentPageHeight = mediaBox.getHeight();
|
|
|
|
|
super.startPage(page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
|
|
if (textPositions == null || textPositions.isEmpty()) {
|
|
|
|
|
super.writeString(text, textPositions);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2026-05-11 19:03:24 +08:00
|
|
|
// log.info("writeString: {}", text);
|
|
|
|
|
|
|
|
|
|
// Rebuild text from individual TextPositions and apply garbled-pattern corrections.
|
|
|
|
|
StringBuilder sb = new StringBuilder();
|
|
|
|
|
for (TextPosition tp : textPositions) {
|
|
|
|
|
String u = tp.getUnicode();
|
|
|
|
|
if (u != null) {
|
|
|
|
|
sb.append(u);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
String correctedText = correctGarbledText(sb.toString());
|
|
|
|
|
|
|
|
|
|
// Log original vs corrected text at INFO level for diagnostics
|
|
|
|
|
// log.info("writeString: raw=[{}] corrected=[{}]", text, correctedText);
|
|
|
|
|
|
|
|
|
|
// Debug: log each TextPosition's unicode with code points AND font name
|
|
|
|
|
if (log.isDebugEnabled()) {
|
|
|
|
|
for (TextPosition tp : textPositions) {
|
|
|
|
|
String u = tp.getUnicode();
|
|
|
|
|
if (u != null && !u.isEmpty() && u.charAt(0) > 0x7F) {
|
|
|
|
|
String fn = tp.getFont() != null ? tp.getFont().getName() : "null";
|
|
|
|
|
log.debug("TextPos: unicode={} codePoints={} fontSize={} font={}",
|
|
|
|
|
u, toCodePoints(u), String.format("%.1f", tp.getFontSizeInPt()), fn);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-05-06 15:49:28 +08:00
|
|
|
|
|
|
|
|
TextPosition first = textPositions.get(0);
|
|
|
|
|
TextPosition last = textPositions.get(textPositions.size() - 1);
|
|
|
|
|
|
|
|
|
|
TextElement elem = new TextElement();
|
2026-05-11 19:03:24 +08:00
|
|
|
elem.setText(correctedText.trim());
|
2026-05-06 15:49:28 +08:00
|
|
|
elem.setPageNum(currentPage);
|
|
|
|
|
elem.setX(first.getX());
|
|
|
|
|
elem.setY(first.getY());
|
|
|
|
|
elem.setWidth(Math.abs((last.getX() + last.getWidth()) - first.getX()));
|
|
|
|
|
elem.setHeight(Math.max(1, Math.abs(first.getHeight())));
|
|
|
|
|
elem.setFontSize(first.getFontSizeInPt());
|
|
|
|
|
elem.setPageWidth(currentPageWidth);
|
|
|
|
|
elem.setPageHeight(currentPageHeight);
|
|
|
|
|
|
|
|
|
|
if (!elem.getText().isEmpty()) {
|
2026-05-11 19:03:24 +08:00
|
|
|
elem.setSeqNum(seqCounter++);
|
2026-05-06 15:49:28 +08:00
|
|
|
elements.add(elem);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
super.writeString(text, textPositions);
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-11 19:03:24 +08:00
|
|
|
/**
|
|
|
|
|
* Correct common garbled text patterns produced by PDFBox on Windows.
|
|
|
|
|
* <p>
|
|
|
|
|
* When a PDF uses Symbol / engineering fonts, the glyph codes are
|
|
|
|
|
* sometimes misinterpreted through the platform's default charset
|
|
|
|
|
* (GBK on Chinese Windows, Latin-1 on Western Windows), producing
|
|
|
|
|
* garbled character pairs or individual garbled chars.
|
|
|
|
|
* <p>
|
|
|
|
|
* Pair replacements are tried first; if the pair is split across
|
|
|
|
|
* separate writeString() calls, individual char fallbacks apply.
|
|
|
|
|
*/
|
|
|
|
|
private String correctGarbledText(String text) {
|
|
|
|
|
if (text == null || text.isEmpty()) return text;
|
|
|
|
|
|
|
|
|
|
String result = text
|
|
|
|
|
// ---- Pair replacements (both chars in same writeString call) ----
|
|
|
|
|
.replace("\u00a1\u00a4", "\u03a6") // ¡¤ -> Φ (diameter)
|
|
|
|
|
.replace("\u00a1\u00e3", "\u00b0") // ¡ã -> ° (degree)
|
|
|
|
|
.replace("\u00a1\u00c0", "\u00b1") // ¡À -> ± (plus-minus)
|
|
|
|
|
.replace("\u00a6\u00b5", "\u03a6") // ¦µ -> Φ (variant)
|
|
|
|
|
.replace("\u00a1\u00c1", "\u00b1") // ¡Á -> ± (variant)
|
|
|
|
|
.replace("\uffc3n\uffc3", "\u03a6") // Mac garbled Φ
|
|
|
|
|
.replace("\uffc3$\uffc3", "\u03a6") // Mac garbled Φ
|
|
|
|
|
.replace("\ufffdn\ufffd", "\u00d8") // Mac garbled Ø
|
|
|
|
|
// ---- Fallback: individual char replacements ----
|
|
|
|
|
// When ¡¤ is split across separate writeString() calls,
|
|
|
|
|
// each char appears alone. Standalone ¡ -> Φ, ¤ -> removed.
|
|
|
|
|
.replace("\u00a1", "\u03a6") // ¡ -> Φ
|
|
|
|
|
.replace("\u00a4", "") // ¤ -> removed
|
|
|
|
|
// ---- Remove Unicode REPLACEMENT CHARACTER ----
|
|
|
|
|
.replace("\ufffd", "");
|
|
|
|
|
|
|
|
|
|
// Diagnostic log at INFO level when a correction was made
|
|
|
|
|
if (!result.equals(text)) {
|
|
|
|
|
log.info("correctGarbledText: [{}] -> [{}]", toCodePoints(text), toCodePoints(result));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static String toCodePoints(String s) {
|
|
|
|
|
if (s == null) return "null";
|
|
|
|
|
StringBuilder sb = new StringBuilder("[");
|
|
|
|
|
for (int i = 0; i < s.codePointCount(0, s.length()); i++) {
|
|
|
|
|
if (i > 0) sb.append(" ");
|
|
|
|
|
sb.append(String.format("U+%04X", s.codePointAt(s.offsetByCodePoints(0, i))));
|
|
|
|
|
}
|
|
|
|
|
sb.append("]");
|
|
|
|
|
return sb.toString();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public List<TextElement> getElements() {
|
|
|
|
|
return elements;
|
|
|
|
|
}
|
2026-05-06 15:49:28 +08:00
|
|
|
}
|