package extraction; import lombok.Getter; import lombok.extern.slf4j.Slf4j; import model.TextElement; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.List; @Slf4j public class PositionedTextStripper extends PDFTextStripper { @Getter private final List elements = new ArrayList<>(); private int seqCounter = 0; private int currentPage = 0; private float currentPageWidth = 0; private float currentPageHeight = 0; public PositionedTextStripper() throws IOException { super(); } // @Override // protected void processTextPosition(TextPosition text) { // log.info("processTextPosition: {},unicode={},codes={},font={},embedded={},damaged={}" // , text, text.getUnicode(), text.getCharacterCodes(), text.getFont().getName(), text.getFont().isEmbedded(), text.getFont().isDamaged()); // } @Override protected void startPage(PDPage page) throws IOException { currentPage++; PDRectangle mediaBox = page.getMediaBox(); currentPageWidth = mediaBox.getWidth(); currentPageHeight = mediaBox.getHeight(); super.startPage(page); } @Override protected void writeString(String text, List textPositions) throws IOException { if (textPositions == null || textPositions.isEmpty()) { super.writeString(text, textPositions); return; } // Rebuild text from individual TextPositions and apply garbled-pattern corrections. StringBuilder sb = new StringBuilder(); for (TextPosition tp : textPositions) { String u = tp.getUnicode(); if (u != null) { sb.append(u); } } String correctedText = correctGarbledText(sb.toString()); // Debug: log each TextPosition's unicode with code points AND font name if (log.isDebugEnabled()) { for (TextPosition tp : textPositions) { String u = tp.getUnicode(); if (u != null && !u.isEmpty() && u.charAt(0) > 0x7F) { String fn = tp.getFont() != null ? tp.getFont().getName() : "null"; log.debug("TextPos: unicode={} codePoints={} fontSize={} font={}", u, toCodePoints(u), String.format("%.1f", tp.getFontSizeInPt()), fn); } } } TextPosition first = textPositions.get(0); TextPosition last = textPositions.get(textPositions.size() - 1); // Detect text rotation from direction angle float dir = first.getDir(); boolean isVertical = Math.abs(dir - 90) < 1 || Math.abs(dir - 270) < 1; TextElement elem = new TextElement(); elem.setText(correctedText.trim()); elem.setPageNum(currentPage); elem.setX(first.getX()); elem.setY(first.getY()); elem.setRotation(dir); if (isVertical) { // For 90°/270° rotated text, characters are stacked vertically // Width should be the Y-span, Height is the character width float minY = Float.MAX_VALUE, maxY = Float.MIN_VALUE; for (TextPosition tp : textPositions) { minY = Math.min(minY, tp.getY()); maxY = Math.max(maxY, tp.getY() + tp.getHeight()); } elem.setWidth(Math.abs(maxY - minY)); elem.setHeight(Math.max(1, Math.abs(first.getWidth()))); } else { // Normal horizontal text elem.setWidth(Math.abs((last.getX() + last.getWidth()) - first.getX())); elem.setHeight(Math.max(1, Math.abs(first.getHeight()))); } elem.setFontSize(first.getFontSizeInPt()); elem.setPageWidth(currentPageWidth); elem.setPageHeight(currentPageHeight); if (!elem.getText().isEmpty()) { elem.setSeqNum(seqCounter++); elements.add(elem); if (isVertical && log.isDebugEnabled()) { log.debug("ROTATED TEXT: [{}] dir={} x={} y={} w={} h={}", elem.getText(), dir, String.format("%.1f", elem.getX()), String.format("%.1f", elem.getY()), String.format("%.1f", elem.getWidth()), String.format("%.1f", elem.getHeight())); } } super.writeString(text, textPositions); } /** * Correct common garbled text patterns produced by PDFBox on Windows. *

* When a PDF uses Symbol / engineering fonts, the glyph codes are * sometimes misinterpreted through the platform's default charset * (GBK on Chinese Windows, Latin-1 on Western Windows), producing * garbled character pairs or individual garbled chars. *

* Pair replacements are tried first; if the pair is split across * separate writeString() calls, individual char fallbacks apply. */ private String correctGarbledText(String text) { if (text == null || text.isEmpty()) return text; String result = text // ---- Pair replacements (both chars in same writeString call) ---- .replace("\u00a1\u00a4", "\u03a6") // ¡¤ -> Φ (diameter) .replace("\u00a1\u00e3", "\u00b0") // ¡ã -> ° (degree) .replace("\u00a1\u00c0", "\u00b1") // ¡À -> ± (plus-minus) .replace("\u00a6\u00b5", "\u03a6") // ¦µ -> Φ (variant) .replace("\u00a1\u00c1", "\u00b1") // ¡Á -> ± (variant) .replace("\uffc3n\uffc3", "\u03a6") // Mac garbled Φ .replace("\uffc3$\uffc3", "\u03a6") // Mac garbled Φ .replace("\ufffdn\ufffd", "\u00d8") // Mac garbled Ø // ---- Fallback: individual char replacements ---- // When ¡¤ is split across separate writeString() calls, // each char appears alone. Standalone ¡ -> Φ, ¤ -> removed. .replace("\u00a1", "\u03a6") // ¡ -> Φ .replace("\u00a4", ""); // ¤ -> removed // ---- Smart handling of Unicode REPLACEMENT CHARACTER (\ufffd) ---- // In engineering drawings (AutoCAD PDFs), \ufffd before a digit is // almost always a Φ (diameter) symbol whose encoding couldn't be decoded. // Case 1: \ufffd followed by digit → replace with Φ result = result.replaceAll("\ufffd(?=\\d)", "\u03a6"); // Case 2: text consists entirely of \ufffd → standalone symbol → Φ if (result.matches("^\ufffd+$")) { result = "\u03a6"; } else { // Case 3: remaining \ufffd mixed with other text → remove result = result.replace("\ufffd", ""); } // Diagnostic log at INFO level when a correction was made if (!result.equals(text)) { log.info("correctGarbledText: [{}] -> [{}]", toCodePoints(text), toCodePoints(result)); } return result; } private static String toCodePoints(String s) { if (s == null) return "null"; StringBuilder sb = new StringBuilder("["); for (int i = 0; i < s.codePointCount(0, s.length()); i++) { if (i > 0) sb.append(" "); sb.append(String.format("U+%04X", s.codePointAt(s.offsetByCodePoints(0, i)))); } sb.append("]"); return sb.toString(); } }