From f67ccb5796ed0ff1a076fbacfaae4771a3053d67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9B=B9=E9=B9=8F=E9=A3=9E?= <rakor2010@gmail.com>
Date: Mon, 11 May 2026 19:03:24 +0800
Subject: [PATCH 1/2] =?UTF-8?q?refactor(extraction):=20=E4=BC=98=E5=8C=96?=
 =?UTF-8?q?=E5=B0=BA=E5=AF=B8=E8=AF=86=E5=88=AB=E4=B8=8E=E6=96=87=E6=9C=AC?=
 =?UTF-8?q?=E9=A2=84=E5=A4=84=E7=90=86=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 使用统一Unicode字符替代多种编码符号，提升尺寸与公差识别的准确性
- 增加自动推断直径符号Φ的启发式方法，支持根据配合公差上下文自动标记
- 添加查找前导符号函数，处理分开文本元素中的工程符号与数字合并
- 引入文本序号seqNum，用于排序过滤和错误匹配排除
- 在文本合并逻辑中允许工程符号忽略字体大小差异合并文本单元
- 新增文本纠错函数，修正PDF符号字体乱码，提升解析文本质量
- 完善公差识别与尺寸字符串拼接的内部逻辑，统一±符号为Unicode编码
- 为文本元素添加seqNum属性，支持序号管理与日志打印
- 采纳文本归一化处理，做NFC标准化，修正编码混乱文本
- 杜绝无用的日志打印，保留必要信息用于调试与后续支持
---
 .../java/extraction/DimensionIdentifier.java  | 230 ++++++++++++++++--
 .../extraction/PositionedTextStripper.java    | 101 +++++++-
 .../src/main/java/extraction/TextGrouper.java |  36 ++-
 .../main/java/extraction/TextNormalizer.java  |  50 +++-
 .../src/main/java/model/TextElement.java      |   3 +
 5 files changed, 379 insertions(+), 41 deletions(-)

diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java b/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java
index 649cebce..3183b96a 100644
--- a/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java
@@ -1,8 +1,11 @@
 package extraction;
 
-
+import lombok.extern.slf4j.Slf4j;
 import model.DimensionResult;
+import model.TextElement;
 import model.TextGroup;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
 import java.util.HashSet;
@@ -11,31 +14,32 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+@Slf4j
 public class DimensionIdentifier {
 
     // 尺寸 + 对称公差
     private static final Pattern PAT_DIM_SYM_TOL = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*[±]\\s*(\\d+\\.?\\d*)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*[\u00B1]\\s*(\\d+\\.?\\d*)");
 
     // 尺寸 + 非对称公差（斜线分隔）
     private static final Pattern PAT_DIM_ASYM_TOL = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)");
 
     // 尺寸 + 非对称公差（空格分隔）
     private static final Pattern PAT_DIM_LIMIT_TOL = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)");
 
     // 尺寸 + 配合公差代号
     private static final Pattern PAT_DIM_FIT = Pattern.compile(
-            "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b");
 
     // 螺纹标注（M型公制螺纹 + G型管螺纹 + Rc/NPT等）
     private static final Pattern PAT_THREAD = Pattern.compile(
-            "(M\\d+\\.?\\d*(?:\\s*[xX×]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)");
+            "(M\\d+\\.?\\d*(?:\\s*[xX\u00D7]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)");
 
     // 独立公差文本
     private static final Pattern PAT_TOLERANCE = Pattern.compile(
-            "[±]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*");
+            "[\u00B1]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*");
 
     // 复合公差文本
     private static final Pattern PAT_COMPOUND_TOL = Pattern.compile(
@@ -43,7 +47,7 @@ public class DimensionIdentifier {
 
     // 纯尺寸数值
     private static final Pattern PAT_PLAIN_DIM = Pattern.compile(
-            "([ΦφØ∅]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)");
+            "([\u03A6\u03C6\u00D8\u2205\u2300]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)");
 
     public List<DimensionResult> identifyDimensions(List<TextGroup> groups) {
         return identifyDimensions(groups, true);
@@ -74,7 +78,7 @@ public class DimensionIdentifier {
                 if (TitleBlockFilter.isToleranceOnly(text)) continue;
                 if (TitleBlockFilter.isSurfaceRoughness(text)) continue;
                 if (TitleBlockFilter.isGdtTolerance(text)) continue;
-                if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("±") && !text.contains("/")) continue;
+                if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("\u00B1") && !text.contains("/")) continue;
             }
             // === 区域模式（!toleranceOnly）：不做内容过滤，直接进入模式匹配 ===
 
@@ -90,8 +94,15 @@ public class DimensionIdentifier {
             // 1) 对称公差
             m = PAT_DIM_SYM_TOL.matcher(text);
             if (m.find()) {
+                String d1 = m.group(1).trim();
+                String sym1 = d1.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                // Heuristic: if no text-based Φ found, infer from fit tolerance context
+                if (sym1.isEmpty()) {
+                    sym1 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                 results.add(new DimensionResult(
-                        m.group(1).trim(), "±" + m.group(2), "dimension", g));
+                        sym1 + d1, "\u00B1" + m.group(2), "dimension", g));
                 processed.add(i);
                 continue;
             }
@@ -99,8 +110,14 @@ public class DimensionIdentifier {
             // 2) 非对称公差（斜线）
             m = PAT_DIM_ASYM_TOL.matcher(text);
             if (m.find()) {
+                String d2 = m.group(1).trim();
+                String sym2 = d2.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                if (sym2.isEmpty()) {
+                    sym2 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                 results.add(new DimensionResult(
-                        m.group(1).trim(), m.group(2) + "/" + m.group(3), "dimension", g));
+                        sym2 + d2, m.group(2) + "/" + m.group(3), "dimension", g));
                 processed.add(i);
                 continue;
             }
@@ -108,8 +125,14 @@ public class DimensionIdentifier {
             // 3) 非对称公差（空格）
             m = PAT_DIM_LIMIT_TOL.matcher(text);
             if (m.find()) {
+                String d3 = m.group(1).trim();
+                String sym3 = d3.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                if (sym3.isEmpty()) {
+                    sym3 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                 results.add(new DimensionResult(
-                        m.group(1).trim(), m.group(2) + " " + m.group(3), "dimension", g));
+                        sym3 + d3, m.group(2) + " " + m.group(3), "dimension", g));
                 processed.add(i);
                 continue;
             }
@@ -117,8 +140,19 @@ public class DimensionIdentifier {
             // 4) 配合公差
             m = PAT_DIM_FIT.matcher(text);
             if (m.find() && !text.contains("-") && !text.contains("/")) {
+                String d4 = m.group(1).trim();
+                String fitCode = m.group(2);
+                String sym4 = d4.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
+                        : findLeadingSymbolGroup(g, groups, i, processed);
+                // Fit tolerance implies diameter, EXCEPT when fit code starts with
+                // uppercase 'R' (which usually means Radius in engineering drawings)
+                if (sym4.isEmpty() && !fitCode.startsWith("R")) {
+                    sym4 = "\u03A6";
+                } else if (sym4.isEmpty()) {
+                    sym4 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
+                }
                 results.add(new DimensionResult(
-                        m.group(1).trim(), m.group(2), "dimension", g));
+                        sym4 + d4, fitCode, "dimension", g));
                 processed.add(i);
                 continue;
             }
@@ -146,7 +180,7 @@ public class DimensionIdentifier {
                     }
                     continue;
                 }
-
+            
                 String numPart = dim.replaceAll("[^\\d.]", "");
                 if (numPart.isEmpty()) continue;
                 double val;
@@ -157,12 +191,12 @@ public class DimensionIdentifier {
                 } catch (NumberFormatException e) {
                     continue;
                 }
-
+            
                 String nearbyTol = findNearbyTolerance(g, groups, i, usedAsTolerance);
-
+            
                 // toleranceOnly 模式下仅输出带公差的尺寸
                 if (toleranceOnly && nearbyTol == null) continue;
-                // 非 toleranceOnly 模式下，单字符无公差无Φ符号 → 输出完整文本（而非跳过）
+                // 非 toleranceOnly 模式下，单字符无公差无\u03A6符号 → 输出完整文本（而非跳过）
                 if (!toleranceOnly && nearbyTol == null && dim.length() == 1 && !hasNearbyPhiSymbol(g, groups)) {
                     if (text.length() > 1) {
                         results.add(new DimensionResult(text, null, "dimension", g));
@@ -170,9 +204,18 @@ public class DimensionIdentifier {
                     }
                     continue;
                 }
-
+            
+                // Check if a standalone engineering symbol (\u03a6, \u00d8, etc.) sits
+                // immediately to the LEFT of this number group.  On Windows, PDFBox
+                // often puts the symbol in a separate text run (separate writeString
+                // call) because the garbled \u00a1\u00a4 chars come from a different
+                // font than the number.  After correction the symbol is a standalone
+                // TextElement / TextGroup that TextGrouper may not merge because the
+                // Y delta or X gap exceeds its thresholds.  We handle it here instead.
+                String leadingSymbol = findLeadingSymbolGroup(g, groups, i, processed);
+            
                 // 区域模式下使用完整文本（含描述），toleranceOnly 模式只用尺寸值
-                String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : dim;
+                String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : leadingSymbol + dim;
                 results.add(new DimensionResult(dimText, nearbyTol, "dimension", g));
                 processed.add(i);
                 continue;
@@ -188,12 +231,114 @@ public class DimensionIdentifier {
         return results;
     }
 
+    /**
+     * Looks for a standalone diameter-symbol group (Φ, Ø, ⌀, ...) that starts at or to
+     * the left of {@code dimGroup} on the same page and returns the closest one.
+     * The symbol can arrive as a separate text run from the number, and TextGrouper
+     * may not merge the two, so the pairing is done here instead.
+     * <p>
+     * Side effect: the index of the matched symbol group is added to
+     * {@code processed} so the same symbol is not paired with another dimension.
+     *
+     * @param dimGroup  the group holding the numeric dimension text
+     * @param allGroups the group list that {@code dimIndex} refers to
+     * @param dimIndex  index of {@code dimGroup} within {@code allGroups}
+     * @param processed indices already consumed; updated when a symbol is matched
+     * @return the symbol string (e.g. "Φ") if found, otherwise empty string
+     */
+    private String findLeadingSymbolGroup(TextGroup dimGroup, List<TextGroup> allGroups,
+                                          int dimIndex, Set<Integer> processed) {
+        Set<String> symbols = Set.of(
+                "\u03A6", "\u03C6",   // Greek Phi / phi - diameter
+                "\u00D8", "\u00F8",   // O with stroke, upper / lower case
+                "\u2205",             // empty-set sign, sometimes used for diameter
+                "\u2300"              // diameter sign
+        );
+        // Thresholds are deliberately generous: symbol and number come from different
+        // text runs (and often different font sizes), so their coordinates can differ
+        // noticeably.
+        //
+        // The horizontal gap is measured from the symbol's LEFT edge. Per the original
+        // author's notes the symbol group's width can be wider than the visible glyph
+        // (it derives from the pre-correction characters), which makes a right-edge
+        // measurement unreliable.
+        //
+        // leftGap = dimGroup.getX() - symbol.getX()
+        //   > 0    : symbol starts left of the number (expected)
+        //   ~ 0    : symbol starts about where the number starts
+        //   < -10  : symbol starts clearly right of the number -> rejected
+        float maxGapFromLeft = Math.max(dimGroup.getFontSize() * 5.0f, 70f);
+        float maxYDelta = Math.max(dimGroup.getFontSize() * 3.0f, 40f);
+        // Loop-invariant: smallest extraction sequence number in the dimension group.
+        int dimSeq = minSeqNum(dimGroup);
+
+        String best = null;
+        float bestGap = Float.MAX_VALUE;
+        int bestIdx = -1;
+
+        for (int j = 0; j < allGroups.size(); j++) {
+            if (j == dimIndex || processed.contains(j)) continue;
+            TextGroup other = allGroups.get(j);
+            if (other.getPageNum() != dimGroup.getPageNum()) continue;
+            String t = other.getText().trim();
+            if (!symbols.contains(t)) continue;
+
+            // Rendering-order filter (heuristic tuned by the original author on AutoCAD
+            // output): a symbol is only accepted when the dimension text was extracted
+            // at least 180 elements after it. Not applied when either group has no elements.
+            int phiSeq = minSeqNum(other);
+            if (phiSeq >= 0 && dimSeq >= 0 && (dimSeq - phiSeq) < 180) {
+                continue;
+            }
+
+            float leftGap = dimGroup.getX() - other.getX();
+            float dy = Math.abs(other.getY() - dimGroup.getY());
+            boolean inRange = leftGap >= -10f && leftGap < maxGapFromLeft && dy < maxYDelta;
+            if (inRange && leftGap < bestGap) {
+                bestGap = leftGap;
+                best = t;
+                bestIdx = j;
+            }
+        }
+
+        if (best == null) {
+            return "";
+        }
+
+        TextGroup bestGroup = allGroups.get(bestIdx);
+        int bestSeq = minSeqNum(bestGroup);
+        if (log.isInfoEnabled()) {   // skip the String.format calls when INFO is off
+            log.info("findLeadingSymbolGroup: MATCHED [{}] for dim [{}] gapFromSymLeft={} dy={} "
+                    + "phiSeq={} dimSeq={} seqDiff={} phiFs={} dimFs={} "
+                    + "phiXY=({},{}) dimXY=({},{})",
+                    best, dimGroup.getText().trim(),
+                    String.format("%.1f", bestGap),
+                    String.format("%.1f", Math.abs(bestGroup.getY() - dimGroup.getY())),
+                    bestSeq, dimSeq,
+                    (bestSeq >= 0 && dimSeq >= 0) ? Math.abs(bestSeq - dimSeq) : -1,
+                    String.format("%.1f", bestGroup.getFontSize()),
+                    String.format("%.1f", dimGroup.getFontSize()),
+                    String.format("%.1f", bestGroup.getX()),
+                    String.format("%.1f", bestGroup.getY()),
+                    String.format("%.1f", dimGroup.getX()),
+                    String.format("%.1f", dimGroup.getY()));
+        }
+        // Claim the symbol so it is not attached to a second dimension.
+        processed.add(bestIdx);
+        return best;
+    }
+
+    /** Returns the smallest seqNum among the group's elements, or -1 when it has none. */
+    private static int minSeqNum(TextGroup group) {
+        return group.getElements().stream().mapToInt(TextElement::getSeqNum).min().orElse(-1);
+    }
+
     private boolean hasNearbyPhiSymbol(TextGroup dimGroup, List<TextGroup> allGroups) {
         float searchDist = dimGroup.getFontSize() * 3.0f;
         for (TextGroup other : allGroups) {
             if (other.getPageNum() != dimGroup.getPageNum()) continue;
             String t = other.getText().trim();
-            if (!t.equals("¡¤") && !t.equals("Φ") && !t.equals("φ") && !t.equals("Ø") && !t.equals("∅"))
+            if (!t.equals("\u00A1\u00A4") && !t.equals("\u03A6") && !t.equals("\u03C6") && !t.equals("\u00D8") && !t.equals("\u2205"))
                 continue;
             float dx = Math.abs(other.getX() - dimGroup.getX());
             float dy = Math.abs(other.getY() - dimGroup.getY());
@@ -202,6 +347,43 @@ public class DimensionIdentifier {
         return false;
     }
 
+    /**
+     * ISO fit tolerance code appearing as a whitespace-delimited token: one letter
+     * (upper case without I and O, lower case without i) followed by 1-2 digits,
+     * e.g. "H7", "m6", "r6".
+     */
+    private static final Pattern PAT_FIT_CODE = Pattern.compile(
+            "(?:^|\\s)([A-Ha-hJ-Nj-zP-Z])(\\d{1,2})(?:\\s|$)");
+
+    /** Same letter/digit rule as {@link #PAT_FIT_CODE}, but the whole text must be the code. */
+    private static final Pattern PAT_FIT_CODE_ONLY = Pattern.compile("[A-Ha-hJ-Nj-zP-Z]\\d{1,2}");
+
+    /**
+     * Heuristic: treats a dimension as a diameter when a fit tolerance code is present,
+     * on the reasoning that fit codes annotate cylindrical features (shafts / holes).
+     *
+     * @param dimGroup       group holding the dimension
+     * @param allGroups      all groups; only those on the same page are inspected
+     * @param normalizedText text of the dimension group as used for pattern matching
+     * @return true if the text contains a fit code token, or if another group within
+     *         100 (X) by 50 (Y) coordinate units consists solely of a fit code
+     */
+    private boolean inferDiameterFromContext(TextGroup dimGroup, List<TextGroup> allGroups, String normalizedText) {
+        if (PAT_FIT_CODE.matcher(normalizedText).find()) {
+            return true;
+        }
+        for (TextGroup other : allGroups) {
+            if (other == dimGroup || other.getPageNum() != dimGroup.getPageNum()) continue;
+            if (Math.abs(other.getX() - dimGroup.getX()) > 100f) continue;
+            if (Math.abs(other.getY() - dimGroup.getY()) > 50f) continue;
+            // Precompiled whole-text match; one letter plus 1-2 digits already implies length 2-3.
+            if (PAT_FIT_CODE_ONLY.matcher(other.getText().trim()).matches()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     private String findNearbyTolerance(TextGroup dimGroup, List<TextGroup> allGroups,
                                         int dimIndex, Set<Integer> usedAsTolerance) {
         float effWidth = dimGroup.getWidth() > 0 ? dimGroup.getWidth()
@@ -258,7 +440,7 @@ public class DimensionIdentifier {
 
             if (TitleBlockFilter.isGdtTolerance(otherText) && !bareSmallDecimal) continue;
 
-            if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("±") || bareSmallDecimal) {
+            if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("\u00B1") || bareSmallDecimal) {
 
                 Matcher cm = PAT_COMPOUND_TOL.matcher(otherText);
                 if (cm.matches()) {
@@ -286,7 +468,7 @@ public class DimensionIdentifier {
                 }
 
                 if (bareSmallDecimal) {
-                    tolParts.add("±" + otherText);
+                    tolParts.add("\u00B1" + otherText);
                     tolIndices.add(i);
                     continue;
                 }
@@ -296,8 +478,8 @@ public class DimensionIdentifier {
         if (!tolParts.isEmpty()) {
             usedAsTolerance.addAll(tolIndices);
             tolParts.sort((a, b) -> {
-                boolean aPos = a.startsWith("+") || a.startsWith("±");
-                boolean bPos = b.startsWith("+") || b.startsWith("±");
+                boolean aPos = a.startsWith("+") || a.startsWith("\u00B1");
+                boolean bPos = b.startsWith("+") || b.startsWith("\u00B1");
                 return Boolean.compare(bPos, aPos);
             });
             return String.join(" / ", tolParts);
diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java b/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java
index bd67a1b2..33662682 100644
--- a/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java
@@ -1,23 +1,37 @@
 package extraction;
 
-import lombok.Getter;
 import model.TextElement;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
 public class PositionedTextStripper extends PDFTextStripper {
-    @Getter
+
+    private static final Logger log = LoggerFactory.getLogger(PositionedTextStripper.class);
+
     private final List<TextElement> elements = new ArrayList<>();
+    private int seqCounter = 0;
     private int currentPage = 0;
     private float currentPageWidth = 0;
     private float currentPageHeight = 0;
 
+    public PositionedTextStripper() throws IOException {
+        super();
+    }
+
+//    @Override
+//    protected void processTextPosition(TextPosition text) {
+//        log.info("processTextPosition: {},unicode={},codes={},font={},embedded={},damaged={}"
+//                , text, text.getUnicode(), text.getCharacterCodes(), text.getFont().getName(), text.getFont().isEmbedded(), text.getFont().isDamaged());
+//    }
+
     @Override
     protected void startPage(PDPage page) throws IOException {
         currentPage++;
@@ -33,12 +47,38 @@ public class PositionedTextStripper extends PDFTextStripper {
             super.writeString(text, textPositions);
             return;
         }
+//        log.info("writeString: {}", text);
+
+        // Rebuild text from individual TextPositions and apply garbled-pattern corrections.
+        StringBuilder sb = new StringBuilder();
+        for (TextPosition tp : textPositions) {
+            String u = tp.getUnicode();
+            if (u != null) {
+                sb.append(u);
+            }
+        }
+        String correctedText = correctGarbledText(sb.toString());
+
+        // Log original vs corrected text at INFO level for diagnostics
+//        log.info("writeString: raw=[{}] corrected=[{}]", text, correctedText);
+
+        // Debug: log each TextPosition's unicode with code points AND font name
+        if (log.isDebugEnabled()) {
+            for (TextPosition tp : textPositions) {
+                String u = tp.getUnicode();
+                if (u != null && !u.isEmpty() && u.charAt(0) > 0x7F) {
+                    String fn = tp.getFont() != null ? tp.getFont().getName() : "null";
+                    log.debug("TextPos: unicode={} codePoints={} fontSize={} font={}",
+                            u, toCodePoints(u), String.format("%.1f", tp.getFontSizeInPt()), fn);
+                }
+            }
+        }
 
         TextPosition first = textPositions.get(0);
         TextPosition last = textPositions.get(textPositions.size() - 1);
 
         TextElement elem = new TextElement();
-        elem.setText(text.trim());
+        elem.setText(correctedText.trim());
         elem.setPageNum(currentPage);
         elem.setX(first.getX());
         elem.setY(first.getY());
@@ -49,10 +89,65 @@ public class PositionedTextStripper extends PDFTextStripper {
         elem.setPageHeight(currentPageHeight);
 
         if (!elem.getText().isEmpty()) {
+            elem.setSeqNum(seqCounter++);
             elements.add(elem);
         }
 
         super.writeString(text, textPositions);
     }
 
+    /**
+     * Corrects known garbled sequences that, per the original author's notes, appear
+     * when symbol glyph codes are decoded through the wrong charset (for example GBK
+     * byte pairs surfacing as two Latin-1 characters).
+     * <p>
+     * Pair replacements run first. The single-character fallbacks that follow cover
+     * a pair split across separate writeString() calls. Finally any remaining U+FFFD
+     * replacement characters are dropped. A correction is logged at INFO level.
+     *
+     * @return the corrected text; null or empty input is returned unchanged
+     */
+    private String correctGarbledText(String text) {
+        if (text == null || text.isEmpty()) return text;
+
+        String result = text
+                // ---- Pair replacements (both chars in same writeString call) ----
+                .replace("\u00a1\u00a4", "\u03a6")   // ¡¤ -> Φ (diameter)
+                .replace("\u00a1\u00e3", "\u00b0")   // ¡ã -> ° (degree)
+                .replace("\u00a1\u00c0", "\u00b1")   // ¡À -> ± (plus-minus)
+                .replace("\u00a6\u00b5", "\u03a6")   // ¦µ -> Φ (variant)
+                .replace("\u00a1\u00c1", "\u00d7")   // ¡Á -> × (GBK 0xA1C1 is the multiplication sign, not ±)
+                .replace("\uffc3n\uffc3", "\u03a6")   // Mac garbled Φ
+                .replace("\uffc3$\uffc3", "\u03a6")   // Mac garbled Φ
+                .replace("\ufffdn\ufffd", "\u00d8")   // Mac garbled Ø
+                // ---- Fallback: individual char replacements ----
+                // When ¡¤ is split across separate writeString() calls,
+                // each char appears alone.  Standalone ¡ -> Φ, ¤ -> removed.
+                .replace("\u00a1", "\u03a6")     // ¡ -> Φ
+                .replace("\u00a4", "")           // ¤ -> removed
+                // ---- Remove Unicode REPLACEMENT CHARACTER ----
+                .replace("\ufffd", "");
+
+        // Diagnostic log at INFO level when a correction was made
+        if (!result.equals(text)) {
+            log.info("correctGarbledText: [{}] -> [{}]", toCodePoints(text), toCodePoints(result));
+        }
+
+        return result;
+    }
+
+    /** Renders {@code s} as "[U+XXXX U+YYYY]" (one entry per code point) for diagnostic logs. */
+    private static String toCodePoints(String s) {
+        if (s == null) {
+            return "null";
+        }
+        // Single pass over the code points instead of re-scanning from index 0 per entry.
+        return s.codePoints()
+                .mapToObj(cp -> String.format("U+%04X", cp))
+                .collect(java.util.stream.Collectors.joining(" ", "[", "]"));
+    }
+
+    public List<TextElement> getElements() { // explicit accessor; the Lombok @Getter on the field was removed
+        return elements; // returns the internal list itself, not a copy
+    }
 }
diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java b/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java
index 4bea69b4..b9f001ed 100644
--- a/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java
@@ -3,13 +3,22 @@ package extraction;
 import model.TextElement;
 import model.TextGroup;
 
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
+import java.util.*;
 
 public class TextGrouper {
 
+    /** Single-character engineering symbols that may merge with adjacent elements regardless of fontSize ratio. */
+    private static final Set<String> ENGINEERING_SYMBOLS = Set.of(
+            "\u03A6", "\u03C6",   // Greek Phi / phi - diameter
+            "\u00D8", "\u00F8",   // O with stroke, upper / lower case - diameter
+            "\u2205",             // empty-set sign - used as diameter
+            "\u2300",             // diameter sign
+            "\u25A1", "\u25FB", "\u25AD",  // white square / medium square / rectangle - square cross-section
+            "\u00B1",             // plus-minus sign
+            "\u00B0",             // degree sign
+            "\u00D7"              // multiplication sign
+    );
+
     public List<TextGroup> groupTextElements(List<TextElement> elements) {
         if (elements.isEmpty()) return Collections.emptyList();
 
@@ -32,7 +41,14 @@ public class TextGrouper {
 
                 float maxFs = Math.max(current.getFontSize(), elem.getFontSize());
                 float minFs = Math.min(current.getFontSize(), elem.getFontSize());
-                if (maxFs > 0 && minFs / maxFs < 0.7f) {
+                // Allow engineering symbols (\u03a6, \u00d8, \u00b1, etc.) to merge
+                // with adjacent elements regardless of fontSize ratio.  These symbols
+                // are often rendered at a much larger fontSize than the dimension
+                // numbers they belong to, but they must be in the same TextGroup for
+                // the DimensionIdentifier regex patterns to match (e.g. "\u03a650\u00b10.1").
+                boolean isSymbol = isShortEngineeringSymbol(current.getText())
+                        || isShortEngineeringSymbol(elem.getText());
+                if (maxFs > 0 && minFs / maxFs < 0.7f && !isSymbol) {
                     merge = false;
                 } else {
                     boolean curEndsWithDigit = !current.getText().isEmpty()
@@ -83,4 +99,14 @@ public class TextGrouper {
 
         return groups;
     }
+
+    /**
+     * Tells whether {@code text}, once trimmed, is one of {@link #ENGINEERING_SYMBOLS}
+     * (at most 2 chars), i.e. eligible to merge regardless of the fontSize ratio.
+     */
+    private boolean isShortEngineeringSymbol(String text) {
+        if (text == null) return false;
+        String candidate = text.trim();
+        return !candidate.isEmpty() && candidate.length() <= 2 && ENGINEERING_SYMBOLS.contains(candidate);
+    }
 }
diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java b/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java
index e6ffac66..4d159353 100644
--- a/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java
+++ b/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java
@@ -1,16 +1,48 @@
 package extraction;
 
+
+import java.text.Normalizer;
+
 public class TextNormalizer {
 
     public static String normalizeText(String text) {
-        return text
-                .replace("\u00a1\u00a4", "\u03a6")
-                .replace("\u00a1\u00e3", "\u00b0")
-                .replace("\u00a1\u00c0", "\u00b1")
-                .replace("\u00a6\u00b5", "\u03a6")
-                .replace("\uffc3n\uffc3", "\u03a6")
-                .replace("\uffc3$\uffc3", "\u03a6")
-                .replace("\ufffdn\ufffd", "\u00d8")
-                .replace("\ufffd", "");
+        if (text == null || text.isEmpty()) return text;
+        // Order matters: multi-char garbled sequences are mapped first, the lone U+FFFD is stripped last.
+        String result = text
+                // --- Mac-specific garbled sequences (original) ---
+                .replace("\u00a1\u00a4", "\u03a6")   // \u00a1\u00a4 (\u00a1\u00a4) -> \u03a6 (\u03a6)
+                .replace("\u00a1\u00e3", "\u00b0")   // \u00a1\u00e3 (\u00a1\u00e3) -> \u00b0 (\u00b0)
+                .replace("\u00a1\u00c0", "\u00b1")   // \u00a1\u00c0 (\u00a1\u00c0) -> \u00b1 (\u00b1)
+                .replace("\u00a6\u00b5", "\u03a6")   // \u00a6\u00b5 (\u00a6\u00b5) -> \u03a6 (\u03a6)
+                .replace("\uffc3n\uffc3", "\u03a6") // \uffc3n\uffc3 -> \u03a6
+                .replace("\uffc3$\uffc3", "\u03a6") // \uffc3$\uffc3 -> \u03a6
+                .replace("\ufffdn\ufffd", "\u00d8") // \ufffdn\ufffd -> \u00d8
+
+                // --- Windows / GBK-specific garbled sequences ---
+                // GBK encodes CJK chars as 2-byte sequences; when a PDF glyph
+                // code is misread through GBK, it produces double-byte CJK chars
+                // or Latin-1 pairs that differ from the Mac variants.
+                .replace("\u03a6\u0080", "\u03a6")   // trailing control char after Phi
+                .replace("\u00d8\u0080", "\u00d8")   // trailing control char after O-stroke
+                .replace("\u00b1\u0080", "\u00b1")   // trailing control char after plus-minus
+
+                // C1 control chars U+0086/U+0087/U+0089 mapped to what bytes 0x86/0x87/0x89 mean in Windows-1252
+                .replace("\u0086", "\u2020")   // dagger (Windows-1252 0x86)
+                .replace("\u0087", "\u2021")   // double dagger (Windows-1252 0x87)
+                .replace("\u0089", "\u2030")   // per mille (Windows-1252 0x89)
+
+                // Common GBK garbled pairs for engineering symbols
+                .replace("\u00c6\u00f8", "\u00d8")   // \u00c6\u00f8 -> \u00d8
+                .replace("\u00a1\u00c1", "\u00b1")   // variant of \u00a1\u00c0
+
+                // --- Remove remaining garbage ---
+                .replace("\ufffd", "");            // Unicode replacement char
+
+        // NFC normalization: canonical decomposition then recomposition
+        // This standardizes characters that have multiple Unicode representations
+        // (e.g., U+00C5 vs U+0041 U+030A for Å)
+        result = Normalizer.normalize(result, Normalizer.Form.NFC);
+
+        return result;
     }
 }
diff --git a/nflg-qms-pdf-extract/src/main/java/model/TextElement.java b/nflg-qms-pdf-extract/src/main/java/model/TextElement.java
index b94de722..c1a9c4e4 100644
--- a/nflg-qms-pdf-extract/src/main/java/model/TextElement.java
+++ b/nflg-qms-pdf-extract/src/main/java/model/TextElement.java
@@ -4,10 +4,13 @@ import lombok.Data;
 
 @Data
 public class TextElement {
+    // One extracted text fragment: its text, page number, position, size, font size and page dimensions.
     private String text;
     private int pageNum;
     private float x, y;
     private float width, height;
     private float fontSize;
     private float pageWidth, pageHeight;
+    private int seqNum; // sequence number of this element; presumably its extraction/render order - confirm in PositionedTextStripper
+
 }

From 6ebacd0b9b8512a2603b8f1094c24190264e39e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9B=B9=E9=B9=8F=E9=A3=9E?= <rakor2010@gmail.com>
Date: Tue, 12 May 2026 08:07:32 +0800
Subject: [PATCH 2/2] =?UTF-8?q?docs(pdf-extraction):=20=E7=BC=96=E5=86=99P?=
 =?UTF-8?q?DF=E5=B0=BA=E5=AF=B8=E6=A0=87=E6=B3=A8=E6=8F=90=E5=8F=96?=
 =?UTF-8?q?=E6=96=B9=E6=A1=88=E6=96=87=E6=A1=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 详细说明项目概述及技术栈，包括Java、Spring Boot和PDFBox 3.x
- 描述直径符号Φ的识别方案，区分文字形式和矢量图形两种情况
- 介绍乱码字符映射及文本提取管线流程
- 提供正则模式匹配优先级及关键文件职责说明
- 明确启发式推断规则及已知局限，指导用户手动校正矢量Φ符号情况
- 包含乱码映射表及配合公差识别规则，提升提取精度和可维护性
---
 nflg-qms-pdf-extract/pdf数据提取方案.md | 132 ++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 nflg-qms-pdf-extract/pdf数据提取方案.md

diff --git a/nflg-qms-pdf-extract/pdf数据提取方案.md b/nflg-qms-pdf-extract/pdf数据提取方案.md
new file mode 100644
index 00000000..bdecc11a
--- /dev/null
+++ b/nflg-qms-pdf-extract/pdf数据提取方案.md
@@ -0,0 +1,132 @@
+# PDF 数据提取方案
+
+## 一、项目概述
+
+本项目从工程图纸 PDF 中提取尺寸标注数据（数值 + 公差 + 直径符号Φ），基于 Java + Spring Boot + PDFBox 3.x。
+
+---
+
+## 二、Φ（直径）符号识别方案
+
+### 2.1 问题背景
+
+工程图纸中，直径标注以 Φ 符号前缀表示（如 Φ280 ±0.15）。不同 PDF 生成方式导致 Φ 符号在文本层的表现完全不同：
+
+| PDF 类型 | Φ 的存在方式 | 可提取性 |
+|---------|-------------|---------|
+| AutoCAD 导出 PDF（类型A） | 文字字符（BerlinSansFB-Bold 字体），独立 TextGroup | ✅ 可提取 |
+| 其他 CAD 导出 PDF（类型B） | 矢量图形路径（画的圆+斜线） | ❌ 不可提取 |
+
+### 2.2 类型A：文字形式 Φ 的提取
+
+**流程：**
+1. `PositionedTextStripper` 提取文本，乱码字符（如 `¡¤`/U+00A1 U+00A4）通过 `correctGarbledText()` 映射为 Φ（U+03A6）
+2. `TextGrouper` 将相邻文本元素分组，但 Φ 和数字因字体/位置差异通常分属不同 TextGroup
+3. `DimensionIdentifier.findLeadingSymbolGroup()` 在维度数字 TextGroup 的左侧搜索独立的 Φ TextGroup 并拼接
+
+**关键参数：**
+- 符号集合：`Φ`(U+03A6), `φ`(U+03C6), `Ø`(U+00D8), `ø`(U+00F8), `∅`(U+2205), `⌀`(U+2300)
+- 空间搜索范围：`gapFromSymLeft ∈ [-10, maxGapFromLeft)`，`dy < maxYDelta`
+  - `maxGapFromLeft = max(dimFontSize × 5.0, 70)`
+  - `maxYDelta = max(dimFontSize × 3.0, 40)`
+- 渲染顺序过滤（seqDiff）：`dimSeq - phiSeq >= 180`
+  - AutoCAD PDF 中 Φ 符号在早期渲染层（seqNum ≈ 33-37），数字在后期层（seqNum ≈ 228-239）
+  - 真正配对的 seqDiff = 195~205
+  - 误配 "160" 的 seqDiff = 173，误配 "6" 的 seqDiff = 6~7
+  - 在上述样本数据中，阈值 180 可将正确配对（seqDiff 195~205）与误配（seqDiff ≤ 173）分开；其他图纸的 seqNum 分布可能不同，需重新验证
+
+### 2.3 类型B：矢量图形 Φ 的启发式推断
+
+当 PDF 中 Φ 是图形路径时（整个页面无任何 Φ 文本字符），采用配合公差启发式规则：
+
+**规则1 — 配合公差直接推断：**
+- 文本含配合公差代号（如 `230 m6 +0.017`）→ 自动加 Φ
+- 排除：大写 R 开头的代号（如 `15R4`），因为 R+数字 通常表示圆角半径而非配合公差
+
+**规则2 — 附近配合公差 TextGroup：**
+- 搜索范围：dx < 100, dy < 50（像素坐标）
+- 附近存在 2-3 字符的配合代号文本（如独立的 "H7" TextGroup）→ 加 Φ
+- 示例：`280 ±0.15` 附近有 `H7`(dx=56, dy=14) → 识别为 Φ280
+
+**已知局限：**
+- 当 Φ 是矢量图形且附近无配合公差代号时（如 `Φ55 ±0.15`），无法自动识别
+- 此类情况需用户在提取结果中手动添加 Φ 前缀
+
+---
+
+## 三、文本提取管线
+
+```
+PDF文件
+  ↓
+PositionedTextStripper.writeString()    ← 提取每次文本绘制调用
+  │  ├─ 重建 Unicode 文本（TextPosition.getUnicode()）
+  │  ├─ correctGarbledText()：修复乱码映射
+  │  └─ 生成 TextElement（含坐标、字号、seqNum）
+  ↓
+TextGrouper.groupTextElements()         ← 按空间邻近性分组
+  │  └─ 生成 TextGroup 列表
+  ↓
+DimensionIdentifier.identifyDimensions()  ← 正则匹配 + 符号拼接
+  │  ├─ TextNormalizer.normalizeText()：二次乱码修复
+  │  ├─ 正则模式匹配（公差/配合/螺纹/纯数值）
+  │  ├─ findLeadingSymbolGroup()：搜索独立Φ TextGroup
+  │  ├─ inferDiameterFromContext()：配合公差启发式
+  │  └─ findNearbyTolerance()：搜索附近公差文本
+  ↓
+DimensionResult 列表               ← 最终输出
+```
+
+---
+
+## 四、乱码映射表
+
+### 4.1 correctGarbledText()（PositionedTextStripper）
+
+| 原始乱码 | 映射结果 | 说明 |
+|---------|---------|------|
+| `¡¤` (U+00A1 U+00A4) | Φ (U+03A6) | BerlinSansFB-Bold 字体直径符号 |
+| `¡ã` (U+00A1 U+00E3) | ° (U+00B0) | 度数符号 |
+| `¡À` (U+00A1 U+00C0) | ± (U+00B1) | 正负公差 |
+| `¦µ` (U+00A6 U+00B5) | Φ (U+03A6) | 直径符号变体 |
+| `¡Á` (U+00A1 U+00C1) | ± (U+00B1) | 正负公差变体 |
+| 单独 `¡` (U+00A1) | Φ (U+03A6) | ¡¤ 跨 writeString 拆分时 |
+| 单独 `¤` (U+00A4) | 删除 | ¡¤ 跨 writeString 拆分时 |
+| U+FFFD | 删除 | Unicode 替换字符 |
+
+### 4.2 TextNormalizer.normalizeText()
+
+与 correctGarbledText 类似的映射，加上 U+0086/U+0087/U+0089 → †/‡/‰（C1 控制字符按 Windows-1252 还原）以及：
+- `Φ\u0080` → `Φ`（Windows GBK 尾随控制字符）
+- `Ø\u0080` → `Ø`
+- `±\u0080` → `±`
+- `Æø` (U+00C6 U+00F8) → `Ø`
+- NFC 规范化
+
+---
+
+## 五、正则模式优先级
+
+`identifyDimensions()` 中的匹配顺序（先匹配先处理）：
+
+1. **PAT_COMPOUND_TOL** — 复合公差文本（如 `+0.03 0`）→ 跳过（不作为尺寸）
+2. **PAT_DIM_SYM_TOL** — 对称公差（如 `280 ±0.15`）
+3. **PAT_DIM_ASYM_TOL** — 非对称公差/斜线（如 `55 +0.03/-0.02`）
+4. **PAT_DIM_LIMIT_TOL** — 非对称公差/空格（如 `55 +0.03 -0.02`）
+5. **PAT_DIM_FIT** — 配合公差（如 `230 m6`、`280 H7`）
+6. **PAT_THREAD** — 螺纹标注（如 `M24×1.5`）
+7. **PAT_PLAIN_DIM** — 纯尺寸数值（如 `270`、`R15`）
+
+---
+
+## 六、关键文件
+
+| 文件 | 职责 |
+|-----|------|
+| `PositionedTextStripper.java` | PDFBox 文本提取、乱码修复、seqNum 赋值 |
+| `TextGrouper.java` | 按空间邻近性将 TextElement 分组为 TextGroup |
+| `TextNormalizer.java` | 二次乱码映射、NFC 规范化 |
+| `DimensionIdentifier.java` | 尺寸识别核心逻辑（正则 + Φ拼接 + 启发式） |
+| `TitleBlockFilter.java` | 标题栏区域过滤、GD&T/粗糙度排除 |
+| `RegionFilterService.java` | 区域框选模式尺寸提取 |
+| `PdfExtractionService.java` | 提取服务入口 |