wms/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java

package extraction;

import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import model.TextElement;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

@Slf4j
public class PositionedTextStripper extends PDFTextStripper {

    @Getter
    private final List<TextElement> elements = new ArrayList<>();
    private int seqCounter = 0;
    private int currentPage = 0;
    private float currentPageWidth = 0;
    private float currentPageHeight = 0;

    public PositionedTextStripper() throws IOException {
        super();
    }

//    @Override
//    protected void processTextPosition(TextPosition text) {
//        log.info("processTextPosition: {},unicode={},codes={},font={},embedded={},damaged={}"
//                , text, text.getUnicode(), text.getCharacterCodes(), text.getFont().getName(), text.getFont().isEmbedded(), text.getFont().isDamaged());
//    }

    @Override
    protected void startPage(PDPage page) throws IOException {
        currentPage++;
        PDRectangle mediaBox = page.getMediaBox();
        currentPageWidth = mediaBox.getWidth();
        currentPageHeight = mediaBox.getHeight();
        super.startPage(page);
    }

    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        if (textPositions == null || textPositions.isEmpty()) {
            super.writeString(text, textPositions);
            return;
        }

        // Rebuild text from individual TextPositions and apply garbled-pattern corrections.
        StringBuilder sb = new StringBuilder();
        for (TextPosition tp : textPositions) {
            String u = tp.getUnicode();
            if (u != null) {
                sb.append(u);
            }
        }
        String correctedText = correctGarbledText(sb.toString());

        // Debug: log each TextPosition's unicode with code points AND font name
        if (log.isDebugEnabled()) {
            for (TextPosition tp : textPositions) {
                String u = tp.getUnicode();
                if (u != null && !u.isEmpty() && u.charAt(0) > 0x7F) {
                    String fn = tp.getFont() != null ? tp.getFont().getName() : "null";
                    log.debug("TextPos: unicode={} codePoints={} fontSize={} font={}",
                            u, toCodePoints(u), String.format("%.1f", tp.getFontSizeInPt()), fn);
                }
            }
        }

        TextPosition first = textPositions.get(0);
        TextPosition last = textPositions.get(textPositions.size() - 1);

        // Detect text rotation from direction angle
        float dir = first.getDir();
        boolean isVertical = Math.abs(dir - 90) < 1 || Math.abs(dir - 270) < 1;

        TextElement elem = new TextElement();
        elem.setText(correctedText.trim());
        elem.setPageNum(currentPage);
        elem.setX(first.getX());
        elem.setY(first.getY());
        elem.setRotation(dir);

        if (isVertical) {
            // For 90°/270° rotated text, characters are stacked vertically
            // Width should be the Y-span, Height is the character width
            float minY = Float.MAX_VALUE, maxY = Float.MIN_VALUE;
            for (TextPosition tp : textPositions) {
                minY = Math.min(minY, tp.getY());
                maxY = Math.max(maxY, tp.getY() + tp.getHeight());
            }
            elem.setWidth(Math.abs(maxY - minY));
            elem.setHeight(Math.max(1, Math.abs(first.getWidth())));
        } else {
            // Normal horizontal text
            elem.setWidth(Math.abs((last.getX() + last.getWidth()) - first.getX()));
            elem.setHeight(Math.max(1, Math.abs(first.getHeight())));
        }

        elem.setFontSize(first.getFontSizeInPt());
        elem.setPageWidth(currentPageWidth);
        elem.setPageHeight(currentPageHeight);

        if (!elem.getText().isEmpty()) {
            elem.setSeqNum(seqCounter++);
            elements.add(elem);
            if (isVertical && log.isDebugEnabled()) {
                log.debug("ROTATED TEXT: [{}] dir={} x={} y={} w={} h={}",
                        elem.getText(), dir,
                        String.format("%.1f", elem.getX()),
                        String.format("%.1f", elem.getY()),
                        String.format("%.1f", elem.getWidth()),
                        String.format("%.1f", elem.getHeight()));
            }
        }

        super.writeString(text, textPositions);
    }

    /**
     * Correct common garbled text patterns produced by PDFBox on Windows.
     * <p>
     * When a PDF uses Symbol / engineering fonts, the glyph codes are
     * sometimes misinterpreted through the platform's default charset
     * (GBK on Chinese Windows, Latin-1 on Western Windows), producing
     * garbled character pairs or individual garbled chars.
     * <p>
     * Pair replacements are tried first; if the pair is split across
     * separate writeString() calls, individual char fallbacks apply.
     */
    private String correctGarbledText(String text) {
        if (text == null || text.isEmpty()) return text;

        String result = text
                // ---- Pair replacements (both chars in same writeString call) ----
                .replace("\u00a1\u00a4", "\u03a6")   // ¡¤ -> Φ (diameter)
                .replace("\u00a1\u00e3", "\u00b0")   // ¡ã -> ° (degree)
                .replace("\u00a1\u00c0", "\u00b1")   // ¡À -> ± (plus-minus)
                .replace("\u00a6\u00b5", "\u03a6")   // ¦µ -> Φ (variant)
                .replace("\u00a1\u00c1", "\u00b1")   // ¡Á -> ± (variant)
                .replace("\uffc3n\uffc3", "\u03a6")   // Mac garbled Φ
                .replace("\uffc3$\uffc3", "\u03a6")   // Mac garbled Φ
                .replace("\ufffdn\ufffd", "\u00d8")   // Mac garbled Ø
                // ---- Fallback: individual char replacements ----
                // When ¡¤ is split across separate writeString() calls,
                // each char appears alone.  Standalone ¡ -> Φ, ¤ -> removed.
                .replace("\u00a1", "\u03a6")     // ¡ -> Φ
                .replace("\u00a4", "");           // ¤ -> removed

        // ---- Smart handling of Unicode REPLACEMENT CHARACTER (\ufffd) ----
        // In engineering drawings (AutoCAD PDFs), \ufffd before a digit is
        // almost always a Φ (diameter) symbol whose encoding couldn't be decoded.
        // Case 1: \ufffd followed by digit → replace with Φ
        result = result.replaceAll("\ufffd(?=\\d)", "\u03a6");
        // Case 2: text consists entirely of \ufffd → standalone symbol → Φ
        if (result.matches("^\ufffd+$")) {
            result = "\u03a6";
        } else {
            // Case 3: remaining \ufffd mixed with other text → remove
            result = result.replace("\ufffd", "");
        }

        // Diagnostic log at INFO level when a correction was made
        if (!result.equals(text)) {
            log.info("correctGarbledText: [{}] -> [{}]", toCodePoints(text), toCodePoints(result));
        }

        return result;
    }

    private static String toCodePoints(String s) {
        if (s == null) return "null";
        StringBuilder sb = new StringBuilder("[");
        for (int i = 0; i < s.codePointCount(0, s.length()); i++) {
            if (i > 0) sb.append(" ");
            sb.append(String.format("U+%04X", s.codePointAt(s.offsetByCodePoints(0, i))));
        }
        sb.append("]");
        return sb.toString();
    }
}