From f67ccb5796ed0ff1a076fbacfaae4771a3053d67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9B=B9=E9=B9=8F=E9=A3=9E?= Date: Mon, 11 May 2026 19:03:24 +0800 Subject: [PATCH 1/2] =?UTF-8?q?refactor(extraction):=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=B0=BA=E5=AF=B8=E8=AF=86=E5=88=AB=E4=B8=8E=E6=96=87=E6=9C=AC?= =?UTF-8?q?=E9=A2=84=E5=A4=84=E7=90=86=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 使用统一Unicode字符替代多种编码符号,提升尺寸与公差识别的准确性 - 增加自动推断直径符号Φ的启发式方法,支持根据配合公差上下文自动标记 - 添加查找前导符号函数,处理分开文本元素中的工程符号与数字合并 - 引入文本序号seqNum,用于排序过滤和错误匹配排除 - 在文本合并逻辑中允许工程符号忽略字体大小差异合并文本单元 - 新增文本纠错函数,修正PDF符号字体乱码,提升解析文本质量 - 完善公差识别与尺寸字符串拼接的内部逻辑,统一±符号为Unicode编码 - 为文本元素添加seqNum属性,支持序号管理与日志打印 - 采纳文本归一化处理,做NFC标准化,修正编码混乱文本 - 杜绝无用的日志打印,保留必要信息用于调试与后续支持 --- .../java/extraction/DimensionIdentifier.java | 230 ++++++++++++++++-- .../extraction/PositionedTextStripper.java | 101 +++++++- .../src/main/java/extraction/TextGrouper.java | 36 ++- .../main/java/extraction/TextNormalizer.java | 50 +++- .../src/main/java/model/TextElement.java | 3 + 5 files changed, 379 insertions(+), 41 deletions(-) diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java b/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java index 649cebce..3183b96a 100644 --- a/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java +++ b/nflg-qms-pdf-extract/src/main/java/extraction/DimensionIdentifier.java @@ -1,8 +1,11 @@ package extraction; - +import lombok.extern.slf4j.Slf4j; import model.DimensionResult; +import model.TextElement; import model.TextGroup; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.HashSet; @@ -11,31 +14,32 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +@Slf4j public class DimensionIdentifier { // 尺寸 + 对称公差 private static final Pattern PAT_DIM_SYM_TOL = Pattern.compile( - 
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*[±]\\s*(\\d+\\.?\\d*)"); + "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*[\u00B1]\\s*(\\d+\\.?\\d*)"); // 尺寸 + 非对称公差(斜线分隔) private static final Pattern PAT_DIM_ASYM_TOL = Pattern.compile( - "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)"); + "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)"); // 尺寸 + 非对称公差(空格分隔) private static final Pattern PAT_DIM_LIMIT_TOL = Pattern.compile( - "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)"); + "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)"); // 尺寸 + 配合公差代号 private static final Pattern PAT_DIM_FIT = Pattern.compile( - "([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b"); + "([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b"); // 螺纹标注(M型公制螺纹 + G型管螺纹 + Rc/NPT等) private static final Pattern PAT_THREAD = Pattern.compile( - "(M\\d+\\.?\\d*(?:\\s*[xX×]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)"); + "(M\\d+\\.?\\d*(?:\\s*[xX\u00D7]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)"); // 独立公差文本 private static final Pattern PAT_TOLERANCE = Pattern.compile( - "[±]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*"); + "[\u00B1]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*"); // 复合公差文本 private static final Pattern PAT_COMPOUND_TOL = Pattern.compile( @@ -43,7 +47,7 @@ public class DimensionIdentifier { // 纯尺寸数值 private static final Pattern PAT_PLAIN_DIM = Pattern.compile( - "([ΦφØ∅]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)"); + "([\u03A6\u03C6\u00D8\u2205\u2300]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)"); public List identifyDimensions(List groups) { return identifyDimensions(groups, true); @@ -74,7 +78,7 @@ public class DimensionIdentifier { if (TitleBlockFilter.isToleranceOnly(text)) continue; if (TitleBlockFilter.isSurfaceRoughness(text)) continue; if 
(TitleBlockFilter.isGdtTolerance(text)) continue; - if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("±") && !text.contains("/")) continue; + if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("\u00B1") && !text.contains("/")) continue; } // === 区域模式(!toleranceOnly):不做内容过滤,直接进入模式匹配 === @@ -90,8 +94,15 @@ public class DimensionIdentifier { // 1) 对称公差 m = PAT_DIM_SYM_TOL.matcher(text); if (m.find()) { + String d1 = m.group(1).trim(); + String sym1 = d1.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? "" + : findLeadingSymbolGroup(g, groups, i, processed); + // Heuristic: if no text-based Φ found, infer from fit tolerance context + if (sym1.isEmpty()) { + sym1 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : ""; + } results.add(new DimensionResult( - m.group(1).trim(), "±" + m.group(2), "dimension", g)); + sym1 + d1, "\u00B1" + m.group(2), "dimension", g)); processed.add(i); continue; } @@ -99,8 +110,14 @@ public class DimensionIdentifier { // 2) 非对称公差(斜线) m = PAT_DIM_ASYM_TOL.matcher(text); if (m.find()) { + String d2 = m.group(1).trim(); + String sym2 = d2.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? "" + : findLeadingSymbolGroup(g, groups, i, processed); + if (sym2.isEmpty()) { + sym2 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : ""; + } results.add(new DimensionResult( - m.group(1).trim(), m.group(2) + "/" + m.group(3), "dimension", g)); + sym2 + d2, m.group(2) + "/" + m.group(3), "dimension", g)); processed.add(i); continue; } @@ -108,8 +125,14 @@ public class DimensionIdentifier { // 3) 非对称公差(空格) m = PAT_DIM_LIMIT_TOL.matcher(text); if (m.find()) { + String d3 = m.group(1).trim(); + String sym3 = d3.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? "" + : findLeadingSymbolGroup(g, groups, i, processed); + if (sym3.isEmpty()) { + sym3 = inferDiameterFromContext(g, groups, text) ? 
"\u03A6" : ""; + } results.add(new DimensionResult( - m.group(1).trim(), m.group(2) + " " + m.group(3), "dimension", g)); + sym3 + d3, m.group(2) + " " + m.group(3), "dimension", g)); processed.add(i); continue; } @@ -117,8 +140,19 @@ public class DimensionIdentifier { // 4) 配合公差 m = PAT_DIM_FIT.matcher(text); if (m.find() && !text.contains("-") && !text.contains("/")) { + String d4 = m.group(1).trim(); + String fitCode = m.group(2); + String sym4 = d4.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? "" + : findLeadingSymbolGroup(g, groups, i, processed); + // Fit tolerance implies diameter, EXCEPT when fit code starts with + // uppercase 'R' (which usually means Radius in engineering drawings) + if (sym4.isEmpty() && !fitCode.startsWith("R")) { + sym4 = "\u03A6"; + } else if (sym4.isEmpty()) { + sym4 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : ""; + } results.add(new DimensionResult( - m.group(1).trim(), m.group(2), "dimension", g)); + sym4 + d4, fitCode, "dimension", g)); processed.add(i); continue; } @@ -146,7 +180,7 @@ public class DimensionIdentifier { } continue; } - + String numPart = dim.replaceAll("[^\\d.]", ""); if (numPart.isEmpty()) continue; double val; @@ -157,12 +191,12 @@ public class DimensionIdentifier { } catch (NumberFormatException e) { continue; } - + String nearbyTol = findNearbyTolerance(g, groups, i, usedAsTolerance); - + // toleranceOnly 模式下仅输出带公差的尺寸 if (toleranceOnly && nearbyTol == null) continue; - // 非 toleranceOnly 模式下,单字符无公差无Φ符号 → 输出完整文本(而非跳过) + // 非 toleranceOnly 模式下,单字符无公差无\u03A6符号 → 输出完整文本(而非跳过) if (!toleranceOnly && nearbyTol == null && dim.length() == 1 && !hasNearbyPhiSymbol(g, groups)) { if (text.length() > 1) { results.add(new DimensionResult(text, null, "dimension", g)); @@ -170,9 +204,18 @@ public class DimensionIdentifier { } continue; } - + + // Check if a standalone engineering symbol (\u03a6, \u00d8, etc.) sits + // immediately to the LEFT of this number group. 
On Windows, PDFBox + // often puts the symbol in a separate text run (separate writeString + // call) because the garbled \u00a1\u00a4 chars come from a different + // font than the number. After correction the symbol is a standalone + // TextElement / TextGroup that TextGrouper may not merge because the + // Y delta or X gap exceeds its thresholds. We handle it here instead. + String leadingSymbol = findLeadingSymbolGroup(g, groups, i, processed); + // 区域模式下使用完整文本(含描述),toleranceOnly 模式只用尺寸值 - String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : dim; + String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : leadingSymbol + dim; results.add(new DimensionResult(dimText, nearbyTol, "dimension", g)); processed.add(i); continue; @@ -188,12 +231,114 @@ public class DimensionIdentifier { return results; } + /** + * Looks for a standalone engineering-symbol group (Φ, Ø, etc.) that sits + * immediately to the LEFT of {@code dimGroup} on the same page. On Windows + * PDFBox often produces these as separate writeString calls because the + * garbled char sequence (e.g. ¡¤) comes from a different font / text-run + * than the following numeric text. TextGrouper may fail to merge them due to + * Y-delta or X-gap thresholds, so we handle the combination here. + * + * @return the symbol string (e.g. "Φ") if found, otherwise empty string + */ + private String findLeadingSymbolGroup(TextGroup dimGroup, List allGroups, + int dimIndex, Set processed) { + Set symbols = Set.of( + "\u03A6", "\u03C6", // Phi / phi - diameter + "\u00D8", "\u00F8", // O-stroke uppercase / lowercase (\u00d8 / \u00f8) + "\u2205", // empty-set - sometimes used for diameter + "\u2300" // \u2300 - ISO/DIN standard diameter sign (most CAD software) + ); + // Generous thresholds: the symbol (e.g. \u03a6 at fontSize=24) and the number + // (e.g. "280" at fontSize=9) come from different fonts/text-runs, so their + // X/Y coordinates may differ more than expected. 
+ // + // IMPORTANT: we use the symbol's LEFT EDGE (other.getX()), NOT the right edge, + // because the symbol TextElement width is computed from the original garbled + // chars (e.g. \u00a1\u00a4 from BerlinSansFB-Bold) whose advance widths are + // larger than the visual \u03a6 glyph. This inflated width makes + // (symbolRight - dimGroup.getX()) negative even when the symbol visually + // precedes the number. + // + // gapFromSymLeft = dimGroup.getX() - other.getX() + // > 0 : symbol starts LEFT of the number (correct for "\u03a6NN") + // = ~0 : symbol almost aligns with number start + // < -3 : symbol starts to the RIGHT of the number (wrong, reject) + float maxGapFromLeft = Math.max(dimGroup.getFontSize() * 5.0f, 70f); + float maxYDelta = Math.max(dimGroup.getFontSize() * 3.0f, 40f); + + String best = null; + float bestDist = Float.MAX_VALUE; + int bestIdx = -1; + + for (int j = 0; j < allGroups.size(); j++) { + if (j == dimIndex || processed.contains(j)) continue; + TextGroup other = allGroups.get(j); + if (other.getPageNum() != dimGroup.getPageNum()) continue; + String t = other.getText().trim(); + if (!symbols.contains(t)) continue; + + float symbolRight = other.getX() + Math.max(other.getWidth(), 1f); + float gapFromRight = dimGroup.getX() - symbolRight; // kept for diagnostics + float gapFromSymLeft = dimGroup.getX() - other.getX(); // reliable: not affected by inflated width + float dy = Math.abs(other.getY() - dimGroup.getY()); + + // Compute seqNum for diagnostic logging + int phiSeq = other.getElements().isEmpty() ? -1 : + other.getElements().stream().mapToInt(TextElement::getSeqNum).min().orElse(-1); + int dimSeq = dimGroup.getElements().isEmpty() ? -1 : + dimGroup.getElements().stream().mapToInt(TextElement::getSeqNum).min().orElse(-1); + + // Rendering-order filter: in AutoCAD PDFs, Φ symbols are rendered in an + // early layer and dimension numbers in a much later layer. + // Valid pairs have dimSeq - phiSeq >= 180. 
+ // This rejects same-layer false positives (e.g. "6" with seqDiff=7) + // and cross-annotation false positives (e.g. "160" with seqDiff=173). + if (phiSeq >= 0 && dimSeq >= 0 && (dimSeq - phiSeq) < 180) { + continue; + } + + if (gapFromSymLeft >= -10f && gapFromSymLeft < maxGapFromLeft && dy < maxYDelta) { + if (gapFromSymLeft < bestDist) { + bestDist = gapFromSymLeft; + best = t; + bestIdx = j; + } + } + } + + if (best != null) { + TextGroup bestGroup = allGroups.get(bestIdx); + int bPhiSeq = bestGroup.getElements().isEmpty() ? -1 : + bestGroup.getElements().stream().mapToInt(TextElement::getSeqNum).min().orElse(-1); + int bDimSeq = dimGroup.getElements().isEmpty() ? -1 : + dimGroup.getElements().stream().mapToInt(TextElement::getSeqNum).min().orElse(-1); + log.info("findLeadingSymbolGroup: MATCHED [{}] for dim [{}] gapFromSymLeft={} dy={} " + + "phiSeq={} dimSeq={} seqDiff={} phiFs={} dimFs={} " + + "phiXY=({},{}) dimXY=({},{})", + best, dimGroup.getText().trim(), + String.format("%.1f", bestDist), + String.format("%.1f", Math.abs(bestGroup.getY() - dimGroup.getY())), + bPhiSeq, bDimSeq, + (bPhiSeq >= 0 && bDimSeq >= 0) ? 
Math.abs(bPhiSeq - bDimSeq) : -1, + String.format("%.1f", bestGroup.getFontSize()), + String.format("%.1f", dimGroup.getFontSize()), + String.format("%.1f", bestGroup.getX()), + String.format("%.1f", bestGroup.getY()), + String.format("%.1f", dimGroup.getX()), + String.format("%.1f", dimGroup.getY())); + processed.add(bestIdx); + return best; + } + return ""; + } + private boolean hasNearbyPhiSymbol(TextGroup dimGroup, List allGroups) { float searchDist = dimGroup.getFontSize() * 3.0f; for (TextGroup other : allGroups) { if (other.getPageNum() != dimGroup.getPageNum()) continue; String t = other.getText().trim(); - if (!t.equals("¡¤") && !t.equals("Φ") && !t.equals("φ") && !t.equals("Ø") && !t.equals("∅")) + if (!t.equals("\u00A1\u00A4") && !t.equals("\u03A6") && !t.equals("\u03C6") && !t.equals("\u00D8") && !t.equals("\u2205")) continue; float dx = Math.abs(other.getX() - dimGroup.getX()); float dy = Math.abs(other.getY() - dimGroup.getY()); @@ -202,6 +347,43 @@ public class DimensionIdentifier { return false; } + /** + * Heuristic: infer that a dimension is a diameter based on context. + * In engineering drawings, fit tolerance codes (H7, m6, r6, etc.) are ONLY + * used for cylindrical features (shafts/holes). If the dimension text itself + * contains a fit code, or a nearby TextGroup on the same page contains one, + * we can safely infer it's a diameter. 
+ * + * Fit tolerance pattern: single letter [A-Ha-h, j-z, J-N, P-Z] + 1-2 digits + * (excludes 'I/i' which is not used in ISO fit system) + */ + private static final Pattern PAT_FIT_CODE = Pattern.compile( + "(?:^|\\s)([A-Ha-hJ-Nj-zP-Z])(\\d{1,2})(?:\\s|$)"); + + private boolean inferDiameterFromContext(TextGroup dimGroup, List allGroups, String normalizedText) { + // 1) Check if the dimension text itself contains a fit code + if (PAT_FIT_CODE.matcher(normalizedText).find()) { + return true; + } + + // 2) Check nearby TextGroups for fit tolerance codes + float searchX = 100f; + float searchY = 50f; + for (TextGroup other : allGroups) { + if (other == dimGroup) continue; + if (other.getPageNum() != dimGroup.getPageNum()) continue; + float dx = Math.abs(other.getX() - dimGroup.getX()); + float dy = Math.abs(other.getY() - dimGroup.getY()); + if (dx > searchX || dy > searchY) continue; + String ot = other.getText().trim(); + // Fit code TextGroups are typically short (2-3 chars like "H7", "m6") + if (ot.length() >= 2 && ot.length() <= 3 && ot.matches("[A-Ha-hJ-Nj-zP-Z]\\d{1,2}")) { + return true; + } + } + return false; + } + private String findNearbyTolerance(TextGroup dimGroup, List allGroups, int dimIndex, Set usedAsTolerance) { float effWidth = dimGroup.getWidth() > 0 ? 
dimGroup.getWidth() @@ -258,7 +440,7 @@ public class DimensionIdentifier { if (TitleBlockFilter.isGdtTolerance(otherText) && !bareSmallDecimal) continue; - if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("±") || bareSmallDecimal) { + if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("\u00B1") || bareSmallDecimal) { Matcher cm = PAT_COMPOUND_TOL.matcher(otherText); if (cm.matches()) { @@ -286,7 +468,7 @@ public class DimensionIdentifier { } if (bareSmallDecimal) { - tolParts.add("±" + otherText); + tolParts.add("\u00B1" + otherText); tolIndices.add(i); continue; } @@ -296,8 +478,8 @@ public class DimensionIdentifier { if (!tolParts.isEmpty()) { usedAsTolerance.addAll(tolIndices); tolParts.sort((a, b) -> { - boolean aPos = a.startsWith("+") || a.startsWith("±"); - boolean bPos = b.startsWith("+") || b.startsWith("±"); + boolean aPos = a.startsWith("+") || a.startsWith("\u00B1"); + boolean bPos = b.startsWith("+") || b.startsWith("\u00B1"); return Boolean.compare(bPos, aPos); }); return String.join(" / ", tolParts); diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java b/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java index bd67a1b2..33662682 100644 --- a/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java +++ b/nflg-qms-pdf-extract/src/main/java/extraction/PositionedTextStripper.java @@ -1,23 +1,37 @@ package extraction; -import lombok.Getter; import model.TextElement; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class PositionedTextStripper extends PDFTextStripper { - @Getter + + private static final Logger log = 
LoggerFactory.getLogger(PositionedTextStripper.class); + private final List elements = new ArrayList<>(); + private int seqCounter = 0; private int currentPage = 0; private float currentPageWidth = 0; private float currentPageHeight = 0; + public PositionedTextStripper() throws IOException { + super(); + } + +// @Override +// protected void processTextPosition(TextPosition text) { +// log.info("processTextPosition: {},unicode={},codes={},font={},embedded={},damaged={}" +// , text, text.getUnicode(), text.getCharacterCodes(), text.getFont().getName(), text.getFont().isEmbedded(), text.getFont().isDamaged()); +// } + @Override protected void startPage(PDPage page) throws IOException { currentPage++; @@ -33,12 +47,38 @@ public class PositionedTextStripper extends PDFTextStripper { super.writeString(text, textPositions); return; } +// log.info("writeString: {}", text); + + // Rebuild text from individual TextPositions and apply garbled-pattern corrections. + StringBuilder sb = new StringBuilder(); + for (TextPosition tp : textPositions) { + String u = tp.getUnicode(); + if (u != null) { + sb.append(u); + } + } + String correctedText = correctGarbledText(sb.toString()); + + // Log original vs corrected text at INFO level for diagnostics +// log.info("writeString: raw=[{}] corrected=[{}]", text, correctedText); + + // Debug: log each TextPosition's unicode with code points AND font name + if (log.isDebugEnabled()) { + for (TextPosition tp : textPositions) { + String u = tp.getUnicode(); + if (u != null && !u.isEmpty() && u.charAt(0) > 0x7F) { + String fn = tp.getFont() != null ? 
tp.getFont().getName() : "null"; + log.debug("TextPos: unicode={} codePoints={} fontSize={} font={}", + u, toCodePoints(u), String.format("%.1f", tp.getFontSizeInPt()), fn); + } + } + } TextPosition first = textPositions.get(0); TextPosition last = textPositions.get(textPositions.size() - 1); TextElement elem = new TextElement(); - elem.setText(text.trim()); + elem.setText(correctedText.trim()); elem.setPageNum(currentPage); elem.setX(first.getX()); elem.setY(first.getY()); @@ -49,10 +89,65 @@ public class PositionedTextStripper extends PDFTextStripper { elem.setPageHeight(currentPageHeight); if (!elem.getText().isEmpty()) { + elem.setSeqNum(seqCounter++); elements.add(elem); } super.writeString(text, textPositions); } + /** + * Correct common garbled text patterns produced by PDFBox on Windows. + *

+ * When a PDF uses Symbol / engineering fonts, the glyph codes are + * sometimes misinterpreted through the platform's default charset + * (GBK on Chinese Windows, Latin-1 on Western Windows), producing + * garbled character pairs or individual garbled chars. + *

+ * Pair replacements are tried first; if the pair is split across + * separate writeString() calls, individual char fallbacks apply. + */ + private String correctGarbledText(String text) { + if (text == null || text.isEmpty()) return text; + + String result = text + // ---- Pair replacements (both chars in same writeString call) ---- + .replace("\u00a1\u00a4", "\u03a6") // ¡¤ -> Φ (diameter) + .replace("\u00a1\u00e3", "\u00b0") // ¡ã -> ° (degree) + .replace("\u00a1\u00c0", "\u00b1") // ¡À -> ± (plus-minus) + .replace("\u00a6\u00b5", "\u03a6") // ¦µ -> Φ (variant) + .replace("\u00a1\u00c1", "\u00b1") // ¡Á -> ± (variant) + .replace("\uffc3n\uffc3", "\u03a6") // Mac garbled Φ + .replace("\uffc3$\uffc3", "\u03a6") // Mac garbled Φ + .replace("\ufffdn\ufffd", "\u00d8") // Mac garbled Ø + // ---- Fallback: individual char replacements ---- + // When ¡¤ is split across separate writeString() calls, + // each char appears alone. Standalone ¡ -> Φ, ¤ -> removed. + .replace("\u00a1", "\u03a6") // ¡ -> Φ + .replace("\u00a4", "") // ¤ -> removed + // ---- Remove Unicode REPLACEMENT CHARACTER ---- + .replace("\ufffd", ""); + + // Diagnostic log at INFO level when a correction was made + if (!result.equals(text)) { + log.info("correctGarbledText: [{}] -> [{}]", toCodePoints(text), toCodePoints(result)); + } + + return result; + } + + private static String toCodePoints(String s) { + if (s == null) return "null"; + StringBuilder sb = new StringBuilder("["); + for (int i = 0; i < s.codePointCount(0, s.length()); i++) { + if (i > 0) sb.append(" "); + sb.append(String.format("U+%04X", s.codePointAt(s.offsetByCodePoints(0, i)))); + } + sb.append("]"); + return sb.toString(); + } + + public List getElements() { + return elements; + } } diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java b/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java index 4bea69b4..b9f001ed 100644 --- a/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java +++ 
b/nflg-qms-pdf-extract/src/main/java/extraction/TextGrouper.java @@ -3,13 +3,22 @@ package extraction; import model.TextElement; import model.TextGroup; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; +import java.util.*; public class TextGrouper { + /** Short engineering symbols that should merge with adjacent elements regardless of fontSize ratio. */ + private static final Set ENGINEERING_SYMBOLS = Set.of( + "\u03A6", "\u03C6", // \u03a6 / \u03c6 (Phi/phi - diameter) + "\u00D8", "\u00F8", // \u00d8 / \u00f8 (O-stroke - diameter) + "\u2205", // \u2205 (empty set - diameter) + "\u2300", // \u2300 (⌀ - ISO/DIN standard diameter sign) + "\u25A1", "\u25FB", "\u25AD", // square cross-section symbols + "\u00B1", // \u00b1 (plus-minus) + "\u00B0", // \u00b0 (degree) + "\u00D7" // \u00d7 (multiply) + ); + public List groupTextElements(List elements) { if (elements.isEmpty()) return Collections.emptyList(); @@ -32,7 +41,14 @@ public class TextGrouper { float maxFs = Math.max(current.getFontSize(), elem.getFontSize()); float minFs = Math.min(current.getFontSize(), elem.getFontSize()); - if (maxFs > 0 && minFs / maxFs < 0.7f) { + // Allow engineering symbols (\u03a6, \u00d8, \u00b1, etc.) to merge + // with adjacent elements regardless of fontSize ratio. These symbols + // are often rendered at a much larger fontSize than the dimension + // numbers they belong to, but they must be in the same TextGroup for + // the DimensionIdentifier regex patterns to match (e.g. "\u03a650\u00b10.1"). 
+ boolean isSymbol = isShortEngineeringSymbol(current.getText()) + || isShortEngineeringSymbol(elem.getText()); + if (maxFs > 0 && minFs / maxFs < 0.7f && !isSymbol) { merge = false; } else { boolean curEndsWithDigit = !current.getText().isEmpty() @@ -83,4 +99,14 @@ public class TextGrouper { return groups; } + + /** + * Check if the text is a short engineering symbol (1-2 chars) that should + * be allowed to merge with adjacent elements regardless of fontSize ratio. + */ + private boolean isShortEngineeringSymbol(String text) { + if (text == null || text.isEmpty()) return false; + String trimmed = text.trim(); + return trimmed.length() <= 2 && ENGINEERING_SYMBOLS.contains(trimmed); + } } diff --git a/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java b/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java index e6ffac66..4d159353 100644 --- a/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java +++ b/nflg-qms-pdf-extract/src/main/java/extraction/TextNormalizer.java @@ -1,16 +1,48 @@ package extraction; + +import java.text.Normalizer; + public class TextNormalizer { public static String normalizeText(String text) { - return text - .replace("\u00a1\u00a4", "\u03a6") - .replace("\u00a1\u00e3", "\u00b0") - .replace("\u00a1\u00c0", "\u00b1") - .replace("\u00a6\u00b5", "\u03a6") - .replace("\uffc3n\uffc3", "\u03a6") - .replace("\uffc3$\uffc3", "\u03a6") - .replace("\ufffdn\ufffd", "\u00d8") - .replace("\ufffd", ""); + if (text == null || text.isEmpty()) return text; + + String result = text + // --- Mac-specific garbled sequences (original) --- + .replace("\u00a1\u00a4", "\u03a6") // \u00a1\u00a4 (\u00a1\u00a4) -> \u03a6 (\u03a6) + .replace("\u00a1\u00e3", "\u00b0") // \u00a1\u00e3 (\u00a1\u00e3) -> \u00b0 (\u00b0) + .replace("\u00a1\u00c0", "\u00b1") // \u00a1\u00c0 (\u00a1\u00c0) -> \u00b1 (\u00b1) + .replace("\u00a6\u00b5", "\u03a6") // \u00a6\u00b5 (\u00a6\u00b5) -> \u03a6 (\u03a6) + .replace("\uffc3n\uffc3", "\u03a6") // 
\uffc3n\uffc3 -> \u03a6 + .replace("\uffc3$\uffc3", "\u03a6") // \uffc3$\uffc3 -> \u03a6 + .replace("\ufffdn\ufffd", "\u00d8") // \ufffdn\ufffd -> \u00d8 + + // --- Windows / GBK-specific garbled sequences --- + // GBK encodes CJK chars as 2-byte sequences; when a PDF glyph + // code is misread through GBK, it produces double-byte CJK chars + // or Latin-1 pairs that differ from the Mac variants. + .replace("\u03a6\u0080", "\u03a6") // trailing control char after Phi + .replace("\u00d8\u0080", "\u00d8") // trailing control char after O-stroke + .replace("\u00b1\u0080", "\u00b1") // trailing control char after plus-minus + + // Windows-1252 / GBK mis-read of Symbol font glyph codes + .replace("\u0086", "\u2020") // dagger (Symbol 0x86) + .replace("\u0087", "\u2021") // double dagger (Symbol 0x87) + .replace("\u0089", "\u2030") // per mille (Symbol 0x89) + + // Common GBK garbled pairs for engineering symbols + .replace("\u00c6\u00f8", "\u00d8") // \u00c6\u00f8 -> \u00d8 + .replace("\u00a1\u00c1", "\u00b1") // variant of \u00a1\u00c0 + + // --- Remove remaining garbage --- + .replace("\ufffd", ""); // Unicode replacement char + + // NFC normalization: canonical decomposition then recomposition + // This standardizes characters that have multiple Unicode representations + // (e.g., U+00C5 vs U+0041 U+030A for Å) + result = Normalizer.normalize(result, Normalizer.Form.NFC); + + return result; } } diff --git a/nflg-qms-pdf-extract/src/main/java/model/TextElement.java b/nflg-qms-pdf-extract/src/main/java/model/TextElement.java index b94de722..c1a9c4e4 100644 --- a/nflg-qms-pdf-extract/src/main/java/model/TextElement.java +++ b/nflg-qms-pdf-extract/src/main/java/model/TextElement.java @@ -4,10 +4,13 @@ import lombok.Data; @Data public class TextElement { + private String text; private int pageNum; private float x, y; private float width, height; private float fontSize; private float pageWidth, pageHeight; + private int seqNum; + } From 
6ebacd0b9b8512a2603b8f1094c24190264e39e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9B=B9=E9=B9=8F=E9=A3=9E?= Date: Tue, 12 May 2026 08:07:32 +0800 Subject: [PATCH 2/2] =?UTF-8?q?docs(pdf-extraction):=20=E7=BC=96=E5=86=99P?= =?UTF-8?q?DF=E5=B0=BA=E5=AF=B8=E6=A0=87=E6=B3=A8=E6=8F=90=E5=8F=96?= =?UTF-8?q?=E6=96=B9=E6=A1=88=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 详细说明项目概述及技术栈,包括Java、Spring Boot和PDFBox 3.x - 描述直径符号Φ的识别方案,区分文字形式和矢量图形两种情况 - 介绍乱码字符映射及文本提取管线流程 - 提供正则模式匹配优先级及关键文件职责说明 - 明确启发式推断规则及已知局限,指导用户手动校正矢量Φ符号情况 - 包含乱码映射表及配合公差识别规则,提升提取精度和可维护性 --- nflg-qms-pdf-extract/pdf数据提取方案.md | 132 ++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 nflg-qms-pdf-extract/pdf数据提取方案.md diff --git a/nflg-qms-pdf-extract/pdf数据提取方案.md b/nflg-qms-pdf-extract/pdf数据提取方案.md new file mode 100644 index 00000000..bdecc11a --- /dev/null +++ b/nflg-qms-pdf-extract/pdf数据提取方案.md @@ -0,0 +1,132 @@ +# PDF 数据提取方案 + +## 一、项目概述 + +本项目从工程图纸 PDF 中提取尺寸标注数据(数值 + 公差 + 直径符号Φ),基于 Java + Spring Boot + PDFBox 3.x。 + +--- + +## 二、Φ(直径)符号识别方案 + +### 2.1 问题背景 + +工程图纸中,直径标注以 Φ 符号前缀表示(如 Φ280 ±0.15)。不同 PDF 生成方式导致 Φ 符号在文本层的表现完全不同: + +| PDF 类型 | Φ 的存在方式 | 可提取性 | +|---------|-------------|---------| +| AutoCAD 导出 PDF(类型A) | 文字字符(BerlinSansFB-Bold 字体),独立 TextGroup | ✅ 可提取 | +| 其他 CAD 导出 PDF(类型B) | 矢量图形路径(画的圆+斜线) | ❌ 不可提取 | + +### 2.2 类型A:文字形式 Φ 的提取 + +**流程:** +1. `PositionedTextStripper` 提取文本,乱码字符(如 `¡¤`/U+00A1 U+00A4)通过 `correctGarbledText()` 映射为 Φ(U+03A6) +2. `TextGrouper` 将相邻文本元素分组,但 Φ 和数字因字体/位置差异通常分属不同 TextGroup +3. 
`DimensionIdentifier.findLeadingSymbolGroup()` 在尺寸数字 TextGroup 的左侧搜索独立的 Φ TextGroup 并拼接 + +**关键参数:** +- 符号集合:`Φ`(U+03A6), `φ`(U+03C6), `Ø`(U+00D8), `ø`(U+00F8), `∅`(U+2205), `⌀`(U+2300) +- 空间搜索范围:`gapFromSymLeft ∈ [-10, maxGapFromLeft)`,`dy < maxYDelta` + - `maxGapFromLeft = max(dimFontSize × 5.0, 70)` + - `maxYDelta = max(dimFontSize × 3.0, 40)` +- 渲染顺序过滤(seqDiff):`dimSeq - phiSeq >= 180` + - AutoCAD PDF 中 Φ 符号在早期渲染层(seqNum ≈ 33-37),数字在后期层(seqNum ≈ 228-239) + - 真正配对的 seqDiff = 195~205 + - 误配 "160" 的 seqDiff = 173,误配 "6" 的 seqDiff = 6~7 + - 阈值 180 可完美分离 + +### 2.3 类型B:矢量图形 Φ 的启发式推断 + +当 PDF 中 Φ 是图形路径时(整个页面无任何 Φ 文本字符),采用配合公差启发式规则: + +**规则1 — 配合公差直接推断:** +- 文本含配合公差代号(如 `230 m6 +0.017`)→ 自动加 Φ +- 排除:大写 R 开头的代号(如 `15R4`),因为 R+数字 通常表示圆角半径而非配合公差 + +**规则2 — 附近配合公差 TextGroup:** +- 搜索范围:dx < 100, dy < 50(PDFBox TextPosition 坐标单位) +- 附近存在 2-3 字符的配合代号文本(如独立的 "H7" TextGroup)→ 加 Φ +- 示例:`280 ±0.15` 附近有 `H7`(dx=56, dy=14) → 识别为 Φ280 + +**已知局限:** +- 当 Φ 是矢量图形且附近无配合公差代号时(如 `Φ55 ±0.15`),无法自动识别 +- 此类情况需用户在提取结果中手动添加 Φ 前缀 + +--- + +## 三、文本提取管线 + +``` +PDF文件 + ↓ +PositionedTextStripper.writeString() ← 提取每次文本绘制调用 + │ ├─ 重建 Unicode 文本(TextPosition.getUnicode()) + │ ├─ correctGarbledText():修复乱码映射 + │ └─ 生成 TextElement(含坐标、字号、seqNum) + ↓ +TextGrouper.groupTextElements() ← 按空间邻近性分组 + │ └─ 生成 TextGroup 列表 + ↓ +DimensionIdentifier.identifyDimensions() ← 正则匹配 + 符号拼接 + │ ├─ TextNormalizer.normalizeText():二次乱码修复 + │ ├─ 正则模式匹配(公差/配合/螺纹/纯数值) + │ ├─ findLeadingSymbolGroup():搜索独立Φ TextGroup + │ ├─ inferDiameterFromContext():配合公差启发式 + │ └─ findNearbyTolerance():搜索附近公差文本 + ↓ +DimensionResult 列表 ← 最终输出 +``` + +--- + +## 四、乱码映射表 + +### 4.1 correctGarbledText()(PositionedTextStripper) + +| 原始乱码 | 映射结果 | 说明 | +|---------|---------|------| +| `¡¤` (U+00A1 U+00A4) | Φ (U+03A6) | BerlinSansFB-Bold 字体直径符号 | +| `¡ã` (U+00A1 U+00E3) | ° (U+00B0) | 度数符号 | +| `¡À` (U+00A1 U+00C0) | ± (U+00B1) | 正负公差 | +| `¦µ` (U+00A6 U+00B5) | Φ (U+03A6) | 直径符号变体 | +| `¡Á` (U+00A1 U+00C1) | ± (U+00B1) | 正负公差变体 | +| 单独 `¡` (U+00A1) | Φ (U+03A6) | ¡¤ 跨
writeString 拆分时 | +| 单独 `¤` (U+00A4) | 删除 | ¡¤ 跨 writeString 拆分时 | +| U+FFFD | 删除 | Unicode 替换字符 | + +### 4.2 TextNormalizer.normalizeText() + +与 correctGarbledText 类似的映射,加上: +- `Φ\u0080` → `Φ`(Windows GBK 尾随控制字符) +- `Ø\u0080` → `Ø` +- `±\u0080` → `±` +- `Æø` (U+00C6 U+00F8) → `Ø` +- NFC 规范化 + +--- + +## 五、正则模式优先级 + +`identifyDimensions()` 中的匹配顺序(先匹配先处理): + +1. **PAT_COMPOUND_TOL** — 复合公差文本(如 `+0.03 0`)→ 跳过(不作为尺寸) +2. **PAT_DIM_SYM_TOL** — 对称公差(如 `280 ±0.15`) +3. **PAT_DIM_ASYM_TOL** — 非对称公差/斜线(如 `55 +0.03/-0.02`) +4. **PAT_DIM_LIMIT_TOL** — 非对称公差/空格(如 `55 +0.03 -0.02`) +5. **PAT_DIM_FIT** — 配合公差(如 `230 m6`、`280 H7`) +6. **PAT_THREAD** — 螺纹标注(如 `M24×1.5`) +7. **PAT_PLAIN_DIM** — 纯尺寸数值(如 `270`、`R15`) + +--- + +## 六、关键文件 + +| 文件 | 职责 | +|-----|------| +| `PositionedTextStripper.java` | PDFBox 文本提取、乱码修复、seqNum 赋值 | +| `TextGrouper.java` | 按空间邻近性将 TextElement 分组为 TextGroup | +| `TextNormalizer.java` | 二次乱码映射、NFC 规范化 | +| `DimensionIdentifier.java` | 尺寸识别核心逻辑(正则 + Φ拼接 + 启发式) | +| `TitleBlockFilter.java` | 标题栏区域过滤、GD&T/粗糙度排除 | +| `RegionFilterService.java` | 区域框选模式尺寸提取 | +| `PdfExtractionService.java` | 提取服务入口 |