refactor(extraction): 优化尺寸识别与文本预处理逻辑

- 使用统一Unicode字符替代多种编码符号，提升尺寸与公差识别的准确性 - 增加自动推断直径符号Φ的启发式方法，支持根据配合公差上下文自动标记 - 添加查找前导符号函数，处理分开文本元素中的工程符号与数字合并 - 引入文本序号seqNum，用于排序过滤和错误匹配排除 - 在文本合并逻辑中允许工程符号忽略字体大小差异合并文本单元 - 新增文本纠错函数，修正PDF符号字体乱码，提升解析文本质量 - 完善公差识别与尺寸字符串拼接的内部逻辑，统一±符号为Unicode编码 - 为文本元素添加seqNum属性，支持序号管理与日志打印 - 采纳文本归一化处理，做NFC标准化，修正编码混乱文本 - 杜绝无用的日志打印，保留必要信息用于调试与后续支持
2026-05-11 19:03:24 +08:00 · 2026-05-11 19:03:24 +08:00 · f67ccb5796
parent eec0ad801e
commit f67ccb5796
5 changed files with 379 additions and 41 deletions
--- a/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java
@ -1,8 +1,11 @@
 package extraction;

-
+import lombok.extern.slf4j.Slf4j;
 import model.DimensionResult;
+import model.TextElement;
 import model.TextGroup;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.util.ArrayList;
 import java.util.HashSet;
@ -11,31 +14,32 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+@Slf4j
 public class DimensionIdentifier {

    // 尺寸 + 对称公差
    private static final Pattern PAT_DIM_SYM_TOL = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*[±]\\s*(\\d+\\.?\\d*)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*[\u00B1]\\s*(\\d+\\.?\\d*)");

    // 尺寸 + 非对称公差（斜线分隔）
    private static final Pattern PAT_DIM_ASYM_TOL = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)");

    // 尺寸 + 非对称公差（空格分隔）
    private static final Pattern PAT_DIM_LIMIT_TOL = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)");

    // 尺寸 + 配合公差代号
    private static final Pattern PAT_DIM_FIT = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b");

    // 螺纹标注（M型公制螺纹 + G型管螺纹 + Rc/NPT等）
    private static final Pattern PAT_THREAD = Pattern.compile(
-            "(M\\d+\\.?\\d*(?:\\s*[xX×]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)");
+            "(M\\d+\\.?\\d*(?:\\s*[xX\u00D7]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)");

    // 独立公差文本
    private static final Pattern PAT_TOLERANCE = Pattern.compile(
-            "[±]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*");
+            "[\u00B1]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*");

    // 复合公差文本
    private static final Pattern PAT_COMPOUND_TOL = Pattern.compile(
@ -43,7 +47,7 @@ public class DimensionIdentifier {

    // 纯尺寸数值
    private static final Pattern PAT_PLAIN_DIM = Pattern.compile(
-            "([ΦφØ∅]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)");

    public List<DimensionResult> identifyDimensions(List<TextGroup> groups) {
        return identifyDimensions(groups, true);
@ -74,7 +78,7 @@ public class DimensionIdentifier {
                if (TitleBlockFilter.isToleranceOnly(text)) continue;
                if (TitleBlockFilter.isSurfaceRoughness(text)) continue;
                if (TitleBlockFilter.isGdtTolerance(text)) continue;
-                if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("±") && !text.contains("/")) continue;
+                if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("\u00B1") && !text.contains("/")) continue;
            }
            // === 区域模式（!toleranceOnly）：不做内容过滤，直接进入模式匹配 ===

@ -90,8 +94,15 @@ public class DimensionIdentifier {
            // 1) 对称公差
            m = PAT_DIM_SYM_TOL.matcher(text);
            if (m.find()) {
+                String d1 = m.group(1).trim();
+                String sym1 = d1.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                // Heuristic: if no text-based Φ found, infer from fit tolerance context
+                if (sym1.isEmpty()) {
+                    sym1 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                results.add(new DimensionResult(
-                        m.group(1).trim(), "±" + m.group(2), "dimension", g));
+                        sym1 + d1, "\u00B1" + m.group(2), "dimension", g));
                processed.add(i);
                continue;
            }
@ -99,8 +110,14 @@ public class DimensionIdentifier {
            // 2) 非对称公差（斜线）
            m = PAT_DIM_ASYM_TOL.matcher(text);
            if (m.find()) {
+                String d2 = m.group(1).trim();
+                String sym2 = d2.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                if (sym2.isEmpty()) {
+                    sym2 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                results.add(new DimensionResult(
-                        m.group(1).trim(), m.group(2) + "/" + m.group(3), "dimension", g));
+                        sym2 + d2, m.group(2) + "/" + m.group(3), "dimension", g));
                processed.add(i);
                continue;
            }
@ -108,8 +125,14 @@ public class DimensionIdentifier {
            // 3) 非对称公差（空格）
            m = PAT_DIM_LIMIT_TOL.matcher(text);
            if (m.find()) {
+                String d3 = m.group(1).trim();
+                String sym3 = d3.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                if (sym3.isEmpty()) {
+                    sym3 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                results.add(new DimensionResult(
-                        m.group(1).trim(), m.group(2) + " " + m.group(3), "dimension", g));
+                        sym3 + d3, m.group(2) + " " + m.group(3), "dimension", g));
                processed.add(i);
                continue;
            }
@ -117,8 +140,19 @@ public class DimensionIdentifier {
            // 4) 配合公差
            m = PAT_DIM_FIT.matcher(text);
            if (m.find() && !text.contains("-") && !text.contains("/")) {
+                String d4 = m.group(1).trim();
+                String fitCode = m.group(2);
+                String sym4 = d4.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                // Fit tolerance implies diameter, EXCEPT when fit code starts with
+                // uppercase 'R' (which usually means Radius in engineering drawings)
+                if (sym4.isEmpty() && !fitCode.startsWith("R")) {
+                    sym4 = "\u03A6";
+                } else if (sym4.isEmpty()) {
+                    sym4 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                results.add(new DimensionResult(
-                        m.group(1).trim(), m.group(2), "dimension", g));
+                        sym4 + d4, fitCode, "dimension", g));
                processed.add(i);
                continue;
            }
@ -146,7 +180,7 @@ public class DimensionIdentifier {
                    }
                    continue;
                }
-
+            
                String numPart = dim.replaceAll("[^\\d.]", "");
                if (numPart.isEmpty()) continue;
                double val;
@ -157,12 +191,12 @@ public class DimensionIdentifier {
                } catch (NumberFormatException e) {
                    continue;
                }
-
+            
                String nearbyTol = findNearbyTolerance(g, groups, i, usedAsTolerance);
-
+            
                // toleranceOnly 模式下仅输出带公差的尺寸
                if (toleranceOnly && nearbyTol == null) continue;
-                // 非 toleranceOnly 模式下，单字符无公差无Φ符号 → 输出完整文本（而非跳过）
+                // 非 toleranceOnly 模式下，单字符无公差无\u03A6符号 → 输出完整文本（而非跳过）
                if (!toleranceOnly && nearbyTol == null && dim.length() == 1 && !hasNearbyPhiSymbol(g, groups)) {
                    if (text.length() > 1) {
                        results.add(new DimensionResult(text, null, "dimension", g));
@ -170,9 +204,18 @@ public class DimensionIdentifier {
                    }
                    continue;
                }
-
+            
+                // Check if a standalone engineering symbol (\u03a6, \u00d8, etc.) sits
+                // immediately to the LEFT of this number group.  On Windows, PDFBox
+                // often puts the symbol in a separate text run (separate writeString
+                // call) because the garbled \u00a1\u00a4 chars come from a different
+                // font than the number.  After correction the symbol is a standalone
+                // TextElement / TextGroup that TextGrouper may not merge because the
+                // Y delta or X gap exceeds its thresholds.  We handle it here instead.
+                String leadingSymbol = findLeadingSymbolGroup(g, groups, i, processed);
+            
                // 区域模式下使用完整文本（含描述），toleranceOnly 模式只用尺寸值
-                String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : dim;
+                String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : leadingSymbol + dim;
                results.add(new DimensionResult(dimText, nearbyTol, "dimension", g));
                processed.add(i);
                continue;
@ -188,12 +231,114 @@ public class DimensionIdentifier {
        return results;
    }

+    /** Standalone glyphs accepted as a diameter prefix for the number that follows. */
+    private static final Set<String> LEADING_DIAMETER_SYMBOLS = Set.of(
+            "\u03A6", "\u03C6",   // Greek capital / small Phi
+            "\u00D8", "\u00F8",   // Latin O with stroke, upper / lower case
+            "\u2205",             // empty set, sometimes used for diameter
+            "\u2300"              // ISO/DIN diameter sign
+    );
+
+    /**
+     * Smallest accepted value of (dimension seqNum - symbol seqNum).
+     * Empirical threshold chosen by the original author: symbols were observed
+     * to be emitted well before the numbers they belong to, while pairs with a
+     * smaller difference (7 and 173 were seen) turned out to be false positives.
+     * NOTE(review): confirm this holds for PDFs from other CAD exporters.
+     */
+    private static final int MIN_SYMBOL_TO_DIM_SEQ_GAP = 180;
+
+    /**
+     * Finds a standalone diameter-symbol group that sits just left of
+     * {@code dimGroup} on the same page, for the case where the symbol and the
+     * number were extracted as separate groups and were not merged earlier.
+     *
+     * <p>A candidate is accepted when all of the following hold:
+     * <ul>
+     *   <li>its trimmed text is exactly one of {@link #LEADING_DIAMETER_SYMBOLS};</li>
+     *   <li>it is neither {@code dimIndex} nor already in {@code processed};</li>
+     *   <li>if both groups carry sequence numbers, the dimension was emitted at
+     *       least {@link #MIN_SYMBOL_TO_DIM_SEQ_GAP} elements after the symbol;</li>
+     *   <li>the horizontal offset between the two LEFT edges lies in
+     *       {@code [-10, max(5 * fontSize, 70))} and the vertical offset is below
+     *       {@code max(3 * fontSize, 40)}.</li>
+     * </ul>
+     * The left edge of the symbol is used on purpose: the original author found
+     * the symbol's reported width to be inflated, which makes its right edge
+     * unreliable.
+     *
+     * <p>Side effect: the index of the chosen symbol group is added to
+     * {@code processed}, so the symbol is not reported again on its own.
+     *
+     * @param dimGroup  the group holding the dimension number
+     * @param allGroups every text group of the document
+     * @param dimIndex  index of {@code dimGroup} inside {@code allGroups}
+     * @param processed indices already consumed; updated when a symbol is matched
+     * @return the matched symbol text, or an empty string when none qualifies
+     */
+    private String findLeadingSymbolGroup(TextGroup dimGroup, List<TextGroup> allGroups,
+                                          int dimIndex, Set<Integer> processed) {
+        float maxGapFromLeft = Math.max(dimGroup.getFontSize() * 5.0f, 70f);
+        float maxYDelta      = Math.max(dimGroup.getFontSize() * 3.0f, 40f);
+        // Loop-invariant: the dimension group's sequence number never changes.
+        int dimSeq = minSeqNum(dimGroup);
+
+        String best    = null;
+        float  bestGap = Float.MAX_VALUE;
+        int    bestIdx = -1;
+
+        for (int j = 0; j < allGroups.size(); j++) {
+            if (j == dimIndex || processed.contains(j)) continue;
+            TextGroup other = allGroups.get(j);
+            if (other.getPageNum() != dimGroup.getPageNum()) continue;
+            String t = other.getText().trim();
+            if (!LEADING_DIAMETER_SYMBOLS.contains(t)) continue;
+
+            // Emission-order filter; skipped when either side has no sequence number.
+            int phiSeq = minSeqNum(other);
+            if (phiSeq >= 0 && dimSeq >= 0 && (dimSeq - phiSeq) < MIN_SYMBOL_TO_DIM_SEQ_GAP) {
+                continue;
+            }
+
+            // gapFromSymLeft > 0   : symbol starts left of the number
+            // gapFromSymLeft ~ 0   : both start at about the same x
+            // gapFromSymLeft < -10 : symbol starts clearly right of the number, rejected
+            float gapFromSymLeft = dimGroup.getX() - other.getX();
+            float dy             = Math.abs(other.getY() - dimGroup.getY());
+            boolean inWindow = gapFromSymLeft >= -10f && gapFromSymLeft < maxGapFromLeft && dy < maxYDelta;
+
+            // NOTE(review): the smallest SIGNED gap wins, so a symbol starting slightly
+            // right of the number beats one starting slightly left of it. Kept as in the
+            // original; confirm whether the absolute gap was intended.
+            if (inWindow && gapFromSymLeft < bestGap) {
+                bestGap = gapFromSymLeft;
+                best    = t;
+                bestIdx = j;
+            }
+        }
+
+        if (best == null) {
+            return "";
+        }
+
+        TextGroup bestGroup = allGroups.get(bestIdx);
+        int bestSeq = minSeqNum(bestGroup);
+        log.info("findLeadingSymbolGroup: MATCHED [{}] for dim [{}] gapFromSymLeft={} dy={} "
+                + "phiSeq={} dimSeq={} seqDiff={} phiFs={} dimFs={} "
+                + "phiXY=({},{}) dimXY=({},{})",
+                best, dimGroup.getText().trim(),
+                String.format("%.1f", bestGap),
+                String.format("%.1f", Math.abs(bestGroup.getY() - dimGroup.getY())),
+                bestSeq, dimSeq,
+                // Same sign convention as the filter above; a matched pair is never negative.
+                (bestSeq >= 0 && dimSeq >= 0) ? dimSeq - bestSeq : -1,
+                String.format("%.1f", bestGroup.getFontSize()),
+                String.format("%.1f", dimGroup.getFontSize()),
+                String.format("%.1f", bestGroup.getX()),
+                String.format("%.1f", bestGroup.getY()),
+                String.format("%.1f", dimGroup.getX()),
+                String.format("%.1f", dimGroup.getY()));
+        processed.add(bestIdx);
+        return best;
+    }
+
+    /** Returns the smallest seqNum among the group's elements, or -1 when it has none. */
+    private static int minSeqNum(TextGroup group) {
+        return group.getElements().stream().mapToInt(TextElement::getSeqNum).min().orElse(-1);
+    }
+
+
    private boolean hasNearbyPhiSymbol(TextGroup dimGroup, List<TextGroup> allGroups) {
        float searchDist = dimGroup.getFontSize() * 3.0f;
        for (TextGroup other : allGroups) {
            if (other.getPageNum() != dimGroup.getPageNum()) continue;
            String t = other.getText().trim();
-            if (!t.equals("¡¤") && !t.equals("Φ") && !t.equals("φ") && !t.equals("Ø") && !t.equals("∅"))
+            if (!t.equals("\u00A1\u00A4") && !t.equals("\u03A6") && !t.equals("\u03C6") && !t.equals("\u00D8") && !t.equals("\u2205"))
                continue;
            float dx = Math.abs(other.getX() - dimGroup.getX());
            float dy = Math.abs(other.getY() - dimGroup.getY());
@ -202,6 +347,43 @@ public class DimensionIdentifier {
        return false;
    }

+    /**
+     * Fit tolerance code embedded in longer text: one letter followed by one or
+     * two digits, delimited by whitespace or the start/end of the string.
+     * Accepted letters are A-H, J-N, P-Z, a-h and j-z, i.e. 'I', 'i' and
+     * upper-case 'O' are excluded.
+     */
+    private static final Pattern PAT_FIT_CODE = Pattern.compile(
+            "(?:^|\\s)([A-Ha-hJ-Nj-zP-Z])(\\d{1,2})(?:\\s|$)");
+
+    /** Same letter/digit shape as {@link #PAT_FIT_CODE}, but it must span the whole text. */
+    private static final Pattern PAT_FIT_CODE_ONLY = Pattern.compile(
+            "[A-Ha-hJ-Nj-zP-Z]\\d{1,2}");
+
+    /**
+     * Heuristic: decides whether a dimension should be treated as a diameter
+     * because a fit tolerance code (H7, m6, r6, ...) appears in its context.
+     * Fit codes are most commonly attached to shafts and holes, so their
+     * presence is taken as evidence of a cylindrical feature.
+     *
+     * <p>Returns {@code true} when either
+     * <ol>
+     *   <li>{@code normalizedText} itself contains a whitespace-delimited fit code, or</li>
+     *   <li>another group on the same page, whose origin is within 100 units
+     *       horizontally and 50 units vertically of {@code dimGroup}, consists of
+     *       nothing but a fit code.</li>
+     * </ol>
+     * NOTE(review): the units are those of {@code TextGroup} coordinates,
+     * presumably PDF points; confirm.
+     *
+     * @param dimGroup       the group holding the dimension
+     * @param allGroups      every text group of the document
+     * @param normalizedText text of the dimension as matched by the caller
+     * @return {@code true} if a fit code was found in the text or nearby
+     */
+    private boolean inferDiameterFromContext(TextGroup dimGroup, List<TextGroup> allGroups, String normalizedText) {
+        // 1) The dimension text itself carries a fit code.
+        if (PAT_FIT_CODE.matcher(normalizedText).find()) {
+            return true;
+        }
+
+        // 2) A neighbouring group is a bare fit code.
+        float searchX = 100f;
+        float searchY = 50f;
+        for (TextGroup other : allGroups) {
+            if (other == dimGroup) continue;
+            if (other.getPageNum() != dimGroup.getPageNum()) continue;
+            float dx = Math.abs(other.getX() - dimGroup.getX());
+            float dy = Math.abs(other.getY() - dimGroup.getY());
+            if (dx > searchX || dy > searchY) continue;
+            // A whole-string match already limits the text to 2-3 characters,
+            // so no separate length check is needed.
+            if (PAT_FIT_CODE_ONLY.matcher(other.getText().trim()).matches()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
    private String findNearbyTolerance(TextGroup dimGroup, List<TextGroup> allGroups,
                                        int dimIndex, Set<Integer> usedAsTolerance) {
        float effWidth = dimGroup.getWidth() > 0 ? dimGroup.getWidth()
@ -258,7 +440,7 @@ public class DimensionIdentifier {

            if (TitleBlockFilter.isGdtTolerance(otherText) && !bareSmallDecimal) continue;

-            if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("±") || bareSmallDecimal) {
+            if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("\u00B1") || bareSmallDecimal) {

                Matcher cm = PAT_COMPOUND_TOL.matcher(otherText);
                if (cm.matches()) {
@ -286,7 +468,7 @@ public class DimensionIdentifier {
                }

                if (bareSmallDecimal) {
-                    tolParts.add("±" + otherText);
+                    tolParts.add("\u00B1" + otherText);
                    tolIndices.add(i);
                    continue;
                }
@ -296,8 +478,8 @@ public class DimensionIdentifier {
        if (!tolParts.isEmpty()) {
            usedAsTolerance.addAll(tolIndices);
            tolParts.sort((a, b) -> {
-                boolean aPos = a.startsWith("+") || a.startsWith("±");
-                boolean bPos = b.startsWith("+") || b.startsWith("±");
+                boolean aPos = a.startsWith("+") || a.startsWith("\u00B1");
+                boolean bPos = b.startsWith("+") || b.startsWith("\u00B1");
                return Boolean.compare(bPos, aPos);
            });
            return String.join(" / ", tolParts);
--- a/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java
@ -1,23 +1,37 @@
 package extraction;

-import lombok.Getter;
 import model.TextElement;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

 public class PositionedTextStripper extends PDFTextStripper {
-    @Getter
+
+    private static final Logger log = LoggerFactory.getLogger(PositionedTextStripper.class);
+
    private final List<TextElement> elements = new ArrayList<>();
+    private int seqCounter = 0;
    private int currentPage = 0;
    private float currentPageWidth = 0;
    private float currentPageHeight = 0;

+    /**
+     * Creates a stripper configured with the defaults of {@link PDFTextStripper}.
+     *
+     * @throws IOException if the superclass constructor fails
+     */
+    public PositionedTextStripper() throws IOException {
+        super();
+    }
+
+//    @Override
+//    protected void processTextPosition(TextPosition text) {
+//        log.info("processTextPosition: {},unicode={},codes={},font={},embedded={},damaged={}"
+//                , text, text.getUnicode(), text.getCharacterCodes(), text.getFont().getName(), text.getFont().isEmbedded(), text.getFont().isDamaged());
+//    }
+
    @Override
    protected void startPage(PDPage page) throws IOException {
        currentPage++;
@ -33,12 +47,38 @@ public class PositionedTextStripper extends PDFTextStripper {
            super.writeString(text, textPositions);
            return;
        }
+//        log.info("writeString: {}", text);
+
+        // Rebuild text from individual TextPositions and apply garbled-pattern corrections.
+        StringBuilder sb = new StringBuilder();
+        for (TextPosition tp : textPositions) {
+            String u = tp.getUnicode();
+            if (u != null) {
+                sb.append(u);
+            }
+        }
+        String correctedText = correctGarbledText(sb.toString());
+
+        // Log original vs corrected text at INFO level for diagnostics
+//        log.info("writeString: raw=[{}] corrected=[{}]", text, correctedText);
+
+        // Debug: log each TextPosition's unicode with code points AND font name
+        if (log.isDebugEnabled()) {
+            for (TextPosition tp : textPositions) {
+                String u = tp.getUnicode();
+                if (u != null && !u.isEmpty() && u.charAt(0) > 0x7F) {
+                    String fn = tp.getFont() != null ? tp.getFont().getName() : "null";
+                    log.debug("TextPos: unicode={} codePoints={} fontSize={} font={}",
+                            u, toCodePoints(u), String.format("%.1f", tp.getFontSizeInPt()), fn);
+                }
+            }
+        }

        TextPosition first = textPositions.get(0);
        TextPosition last = textPositions.get(textPositions.size() - 1);

        TextElement elem = new TextElement();
-        elem.setText(text.trim());
+        elem.setText(correctedText.trim());
        elem.setPageNum(currentPage);
        elem.setX(first.getX());
        elem.setY(first.getY());
@ -49,10 +89,65 @@ public class PositionedTextStripper extends PDFTextStripper {
        elem.setPageHeight(currentPageHeight);

        if (!elem.getText().isEmpty()) {
+            elem.setSeqNum(seqCounter++);
            elements.add(elem);
        }

        super.writeString(text, textPositions);
    }

+    /**
+     * Ordered {garbled, replacement} rules applied by {@link #correctGarbledText}.
+     * Multi-character sequences come first so that they win over the
+     * single-character fallbacks that follow them.
+     */
+    private static final String[][] GARBLED_REPLACEMENTS = {
+            // ---- Multi-character sequences (whole sequence in one writeString call) ----
+            {"\u00a1\u00a4", "\u03a6"},     // U+00A1 U+00A4 -> Phi (diameter)
+            {"\u00a1\u00e3", "\u00b0"},     // U+00A1 U+00E3 -> degree sign
+            {"\u00a1\u00c0", "\u00b1"},     // U+00A1 U+00C0 -> plus-minus
+            {"\u00a6\u00b5", "\u03a6"},     // U+00A6 U+00B5 -> Phi (variant)
+            {"\u00a1\u00c1", "\u00b1"},     // U+00A1 U+00C1 -> plus-minus (variant)
+            {"\uffc3n\uffc3", "\u03a6"},    // U+FFC3 'n' U+FFC3 -> Phi
+            {"\uffc3$\uffc3", "\u03a6"},    // U+FFC3 '$' U+FFC3 -> Phi
+            {"\ufffdn\ufffd", "\u00d8"},    // U+FFFD 'n' U+FFFD -> O with stroke
+            // ---- Single-character fallbacks (sequence split across calls) ----
+            {"\u00a1", "\u03a6"},           // lone U+00A1 -> Phi
+            {"\u00a4", ""},                 // lone U+00A4 -> dropped
+            // ---- Leftover Unicode REPLACEMENT CHARACTER ----
+            {"\ufffd", ""},
+    };
+
+    /**
+     * Rewrites known garbled character sequences into the engineering symbols
+     * they stand for (Phi, O with stroke, degree, plus-minus) and strips
+     * leftover U+FFFD characters.
+     * <p>
+     * The rules in {@link #GARBLED_REPLACEMENTS} are applied one after another,
+     * each on the result of the previous one. Presumably the garbling comes
+     * from symbol-font glyph codes being decoded with the wrong charset;
+     * the mapping itself is purely textual.
+     * <p>
+     * Whenever the text was changed, the code points before and after are
+     * logged at INFO level.
+     *
+     * @param text raw text, may be {@code null} or empty
+     * @return the corrected text; {@code null} or empty input is returned as is
+     */
+    private String correctGarbledText(String text) {
+        if (text == null || text.isEmpty()) {
+            return text;
+        }
+
+        String fixed = text;
+        for (String[] rule : GARBLED_REPLACEMENTS) {
+            fixed = fixed.replace(rule[0], rule[1]);
+        }
+
+        boolean changed = !fixed.equals(text);
+        if (changed) {
+            log.info("correctGarbledText: [{}] -> [{}]", toCodePoints(text), toCodePoints(fixed));
+        }
+        return fixed;
+    }
+
+    /**
+     * Renders a string as a bracketed, space-separated list of its code points
+     * in {@code U+XXXX} notation, e.g. {@code "A"} becomes {@code "[U+0041]"}.
+     * Supplementary characters are reported as a single code point.
+     *
+     * @param s the string to render, may be {@code null}
+     * @return the rendered list, {@code "[]"} for an empty string, or the
+     *         literal {@code "null"} for a {@code null} argument
+     */
+    private static String toCodePoints(String s) {
+        if (s == null) return "null";
+        StringBuilder sb = new StringBuilder("[");
+        // Single pass over the chars: advance by the width of each code point
+        // instead of re-counting from the start of the string on every step.
+        for (int i = 0; i < s.length(); ) {
+            int cp = s.codePointAt(i);
+            if (i > 0) sb.append(" ");
+            sb.append(String.format("U+%04X", cp));
+            i += Character.charCount(cp);
+        }
+        sb.append("]");
+        return sb.toString();
+    }
+
+    /**
+     * Returns the text elements recorded so far.
+     * The internal list itself is returned, not a copy, so changes made by the
+     * caller are visible to this stripper and vice versa.
+     *
+     * @return the live list of recorded elements, never {@code null}
+     */
+    public List<TextElement> getElements() {
+        return elements;
+    }
 }
--- a/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java
@ -3,13 +3,22 @@ package extraction;
 import model.TextElement;
 import model.TextGroup;

-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
+import java.util.*;

 public class TextGrouper {

+    /**
+     * Single-character engineering symbols that may merge with adjacent
+     * elements regardless of the font-size ratio between them.
+     */
+    private static final Set<String> ENGINEERING_SYMBOLS = Set.of(
+            "\u03A6", "\u03C6",   // Greek capital / small Phi - diameter
+            "\u00D8", "\u00F8",   // Latin O with stroke, upper / lower case - diameter
+            "\u2205",             // empty set - used as diameter
+            "\u2300",             // ISO/DIN diameter sign
+            "\u25A1", "\u25FB", "\u25AD",  // white square, white medium square, white rectangle - square cross-section
+            "\u00B1",             // plus-minus sign
+            "\u00B0",             // degree sign
+            "\u00D7"              // multiplication sign
+    );
+
    public List<TextGroup> groupTextElements(List<TextElement> elements) {
        if (elements.isEmpty()) return Collections.emptyList();

@ -32,7 +41,14 @@ public class TextGrouper {

                float maxFs = Math.max(current.getFontSize(), elem.getFontSize());
                float minFs = Math.min(current.getFontSize(), elem.getFontSize());
-                if (maxFs > 0 && minFs / maxFs < 0.7f) {
+                // Allow engineering symbols (\u03a6, \u00d8, \u00b1, etc.) to merge
+                // with adjacent elements regardless of fontSize ratio.  These symbols
+                // are often rendered at a much larger fontSize than the dimension
+                // numbers they belong to, but they must be in the same TextGroup for
+                // the DimensionIdentifier regex patterns to match (e.g. "\u03a650\u00b10.1").
+                boolean isSymbol = isShortEngineeringSymbol(current.getText())
+                        || isShortEngineeringSymbol(elem.getText());
+                if (maxFs > 0 && minFs / maxFs < 0.7f && !isSymbol) {
                    merge = false;
                } else {
                    boolean curEndsWithDigit = !current.getText().isEmpty()
@ -83,4 +99,14 @@ public class TextGrouper {

        return groups;
    }
+
+    /**
+     * Tells whether {@code text}, once trimmed, is one of the
+     * {@code ENGINEERING_SYMBOLS} and at most two characters long. Such
+     * elements are exempt from the font-size ratio check when merging.
+     *
+     * @param text element text, may be {@code null}
+     * @return {@code true} only for a non-empty text that trims to a known symbol
+     */
+    private boolean isShortEngineeringSymbol(String text) {
+        if (text == null || text.isEmpty()) {
+            return false;
+        }
+        String candidate = text.trim();
+        if (candidate.length() > 2) {
+            return false;
+        }
+        return ENGINEERING_SYMBOLS.contains(candidate);
+    }
 }
--- a/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java
@ -1,16 +1,48 @@
 package extraction;

+
+import java.text.Normalizer;
+
 public class TextNormalizer {

+    /**
+     * Replaces known garbled character sequences with the symbols they stand
+     * for, removes leftover U+FFFD characters and returns the result in
+     * Unicode NFC form.
+     *
+     * @param text text to normalize, may be {@code null}
+     * @return the normalized text; {@code null} or empty input is returned as is
+     */
    public static String normalizeText(String text) {
-        return text
-                .replace("\u00a1\u00a4", "\u03a6")
-                .replace("\u00a1\u00e3", "\u00b0")
-                .replace("\u00a1\u00c0", "\u00b1")
-                .replace("\u00a6\u00b5", "\u03a6")
-                .replace("\uffc3n\uffc3", "\u03a6")
-                .replace("\uffc3$\uffc3", "\u03a6")
-                .replace("\ufffdn\ufffd", "\u00d8")
-                .replace("\ufffd", "");
+        if (text == null || text.isEmpty()) return text;
+
+        String result = text
+                // --- Garbled sequences carried over from the previous implementation ---
+                .replace("\u00a1\u00a4", "\u03a6")   // U+00A1 U+00A4 -> Phi
+                .replace("\u00a1\u00e3", "\u00b0")   // U+00A1 U+00E3 -> degree sign
+                .replace("\u00a1\u00c0", "\u00b1")   // U+00A1 U+00C0 -> plus-minus
+                .replace("\u00a6\u00b5", "\u03a6")   // U+00A6 U+00B5 -> Phi
+                .replace("\uffc3n\uffc3", "\u03a6") // U+FFC3 'n' U+FFC3 -> Phi
+                .replace("\uffc3$\uffc3", "\u03a6") // U+FFC3 '$' U+FFC3 -> Phi
+                .replace("\ufffdn\ufffd", "\u00d8") // U+FFFD 'n' U+FFFD -> O with stroke
+
+                // --- Stray control character after a symbol ---
+                // Drops a U+0080 that directly follows Phi, O with stroke or
+                // plus-minus. NOTE(review): presumably a by-product of a
+                // charset mis-decoding on Windows; confirm with sample PDFs.
+                .replace("\u03a6\u0080", "\u03a6")   // Phi + U+0080 -> Phi
+                .replace("\u00d8\u0080", "\u00d8")   // O with stroke + U+0080 -> O with stroke
+                .replace("\u00b1\u0080", "\u00b1")   // plus-minus + U+0080 -> plus-minus
+
+                // C1 control characters mapped to the glyph Windows-1252 assigns to the same byte
+                .replace("\u0086", "\u2020")   // U+0086 -> dagger (Windows-1252 0x86)
+                .replace("\u0087", "\u2021")   // U+0087 -> double dagger (Windows-1252 0x87)
+                .replace("\u0089", "\u2030")   // U+0089 -> per mille (Windows-1252 0x89)
+
+                // Further garbled pairs for engineering symbols
+                .replace("\u00c6\u00f8", "\u00d8")   // U+00C6 U+00F8 -> O with stroke
+                .replace("\u00a1\u00c1", "\u00b1")   // U+00A1 U+00C1 -> plus-minus (variant of U+00A1 U+00C0)
+
+                // --- Remove remaining garbage ---
+                .replace("\ufffd", "");            // Unicode replacement character
+
+        // NFC normalization: canonical decomposition followed by recomposition.
+        // This unifies characters that have several Unicode representations
+        // (e.g. U+00C5 versus U+0041 U+030A).
+        result = Normalizer.normalize(result, Normalizer.Form.NFC);
+
+        return result;
    }
 }
--- a/nflg-qms-pdf-extract/src/main/java/model/TextElement.java
+++ b/nflg-qms-pdf-extract/src/main/java/model/TextElement.java
@ -4,10 +4,13 @@ import lombok.Data;

@Data
 public class TextElement {
+
    private String text;
    private int pageNum;
    private float x, y;
    private float width, height;
    private float fontSize;
    private float pageWidth, pageHeight;
+    /**
+     * Order in which this element was recorded during extraction.
+     * PositionedTextStripper assigns it from an incrementing counter that
+     * starts at 0 and only advances for elements with non-empty text.
+     */
+    private int seqNum;
+
 }