refactor(extraction): 优化尺寸识别与文本预处理逻辑

- 使用统一Unicode字符替代多种编码符号,提升尺寸与公差识别的准确性
- 增加自动推断直径符号Φ的启发式方法,支持根据配合公差上下文自动标记
- 添加查找前导符号函数,处理分开文本元素中的工程符号与数字合并
- 引入文本序号seqNum,用于排序过滤和错误匹配排除
- 在文本合并逻辑中允许工程符号忽略字体大小差异合并文本单元
- 新增文本纠错函数,修正PDF符号字体乱码,提升解析文本质量
- 完善公差识别与尺寸字符串拼接的内部逻辑,统一±符号为Unicode编码
- 为文本元素添加seqNum属性,支持序号管理与日志打印
- 采纳文本归一化处理,做NFC标准化,修正编码混乱文本
- 杜绝无用的日志打印,保留必要信息用于调试与后续支持
This commit is contained in:
曹鹏飞 2026-05-11 19:03:24 +08:00
parent eec0ad801e
commit f67ccb5796
5 changed files with 379 additions and 41 deletions

View File

@ -1,8 +1,11 @@
package extraction;
import lombok.extern.slf4j.Slf4j;
import model.DimensionResult;
import model.TextElement;
import model.TextGroup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.HashSet;
@ -11,31 +14,32 @@ import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Slf4j
public class DimensionIdentifier {
// Dimension followed by a symmetric (plus-minus) tolerance.
// Group 1: optional diameter prefix + nominal value; group 2: tolerance magnitude.
// NOTE(review): the diameter prefix class used by the patterns below does not
// contain U+00F8 (small o with stroke), although the leading-symbol checks in
// identifyDimensions accept it - confirm whether that difference is intended.
private static final Pattern PAT_DIM_SYM_TOL = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*[±]\\s*(\\d+\\.?\\d*)");
"([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*[\u00B1]\\s*(\\d+\\.?\\d*)");
// Dimension followed by an asymmetric tolerance whose two signed limits are separated by a slash.
// Group 1: nominal value; groups 2 and 3: the signed limits.
private static final Pattern PAT_DIM_ASYM_TOL = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)");
"([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)");
// Dimension followed by an asymmetric tolerance whose two signed limits are separated by whitespace.
// Group 1: nominal value; groups 2 and 3: the signed limits.
private static final Pattern PAT_DIM_LIMIT_TOL = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)");
"([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)");
// Dimension followed by a fit tolerance code (one letter + 1-2 digits).
// Group 1: nominal value; group 2: the fit code.
// NOTE(review): the letter class [A-HJ-Zj-z] has no lowercase a-h, so codes
// such as "h6" or "g6" do not match this pattern - confirm that is intended.
private static final Pattern PAT_DIM_FIT = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b");
"([\u03A6\u03C6\u00D8\u2205\u2300]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b");
// Thread callouts: metric "M" threads (optionally with a pitch after x / X / multiplication sign),
// and G / Rc / Rp / NPT pipe threads (optionally written as a fraction).
private static final Pattern PAT_THREAD = Pattern.compile(
"(M\\d+\\.?\\d*(?:\\s*[xX×]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)");
"(M\\d+\\.?\\d*(?:\\s*[xX\u00D7]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)");
// Standalone tolerance text: a plus-minus value or a single signed value.
private static final Pattern PAT_TOLERANCE = Pattern.compile(
"[±]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*");
"[\u00B1]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*");
// 复合公差文本
private static final Pattern PAT_COMPOUND_TOL = Pattern.compile(
@ -43,7 +47,7 @@ public class DimensionIdentifier {
// Plain dimension value with no tolerance attached: a diameter-prefixed number,
// an R- or C-prefixed number, a decimal number, or a bare integer.
private static final Pattern PAT_PLAIN_DIM = Pattern.compile(
"([ΦφØ∅]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)");
"([\u03A6\u03C6\u00D8\u2205\u2300]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)");
public List<DimensionResult> identifyDimensions(List<TextGroup> groups) {
return identifyDimensions(groups, true);
@ -74,7 +78,7 @@ public class DimensionIdentifier {
if (TitleBlockFilter.isToleranceOnly(text)) continue;
if (TitleBlockFilter.isSurfaceRoughness(text)) continue;
if (TitleBlockFilter.isGdtTolerance(text)) continue;
if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("±") && !text.contains("/")) continue;
if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("\u00B1") && !text.contains("/")) continue;
}
// === 区域模式!toleranceOnly不做内容过滤直接进入模式匹配 ===
@ -90,8 +94,15 @@ public class DimensionIdentifier {
// 1) 对称公差
m = PAT_DIM_SYM_TOL.matcher(text);
if (m.find()) {
String d1 = m.group(1).trim();
String sym1 = d1.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
: findLeadingSymbolGroup(g, groups, i, processed);
// Heuristic: if no text-based Φ found, infer from fit tolerance context
if (sym1.isEmpty()) {
sym1 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
}
results.add(new DimensionResult(
m.group(1).trim(), "±" + m.group(2), "dimension", g));
sym1 + d1, "\u00B1" + m.group(2), "dimension", g));
processed.add(i);
continue;
}
@ -99,8 +110,14 @@ public class DimensionIdentifier {
// 2) 非对称公差斜线
m = PAT_DIM_ASYM_TOL.matcher(text);
if (m.find()) {
String d2 = m.group(1).trim();
String sym2 = d2.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
: findLeadingSymbolGroup(g, groups, i, processed);
if (sym2.isEmpty()) {
sym2 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
}
results.add(new DimensionResult(
m.group(1).trim(), m.group(2) + "/" + m.group(3), "dimension", g));
sym2 + d2, m.group(2) + "/" + m.group(3), "dimension", g));
processed.add(i);
continue;
}
@ -108,8 +125,14 @@ public class DimensionIdentifier {
// 3) 非对称公差空格
m = PAT_DIM_LIMIT_TOL.matcher(text);
if (m.find()) {
String d3 = m.group(1).trim();
String sym3 = d3.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
: findLeadingSymbolGroup(g, groups, i, processed);
if (sym3.isEmpty()) {
sym3 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
}
results.add(new DimensionResult(
m.group(1).trim(), m.group(2) + " " + m.group(3), "dimension", g));
sym3 + d3, m.group(2) + " " + m.group(3), "dimension", g));
processed.add(i);
continue;
}
@ -117,8 +140,19 @@ public class DimensionIdentifier {
// 4) 配合公差
m = PAT_DIM_FIT.matcher(text);
if (m.find() && !text.contains("-") && !text.contains("/")) {
String d4 = m.group(1).trim();
String fitCode = m.group(2);
String sym4 = d4.matches("^[\u03A6\u03C6\u00D8\u00F8\u2205\u2300].*") ? ""
: findLeadingSymbolGroup(g, groups, i, processed);
// Fit tolerance implies diameter, EXCEPT when fit code starts with
// uppercase 'R' (which usually means Radius in engineering drawings)
if (sym4.isEmpty() && !fitCode.startsWith("R")) {
sym4 = "\u03A6";
} else if (sym4.isEmpty()) {
sym4 = inferDiameterFromContext(g, groups, text) ? "\u03A6" : "";
}
results.add(new DimensionResult(
m.group(1).trim(), m.group(2), "dimension", g));
sym4 + d4, fitCode, "dimension", g));
processed.add(i);
continue;
}
@ -146,7 +180,7 @@ public class DimensionIdentifier {
}
continue;
}
String numPart = dim.replaceAll("[^\\d.]", "");
if (numPart.isEmpty()) continue;
double val;
@ -157,12 +191,12 @@ public class DimensionIdentifier {
} catch (NumberFormatException e) {
continue;
}
String nearbyTol = findNearbyTolerance(g, groups, i, usedAsTolerance);
// toleranceOnly 模式下仅输出带公差的尺寸
if (toleranceOnly && nearbyTol == null) continue;
// toleranceOnly 模式下单字符无公差无Φ符号 输出完整文本而非跳过
// toleranceOnly 模式下单字符无公差无\u03A6符号 输出完整文本而非跳过
if (!toleranceOnly && nearbyTol == null && dim.length() == 1 && !hasNearbyPhiSymbol(g, groups)) {
if (text.length() > 1) {
results.add(new DimensionResult(text, null, "dimension", g));
@ -170,9 +204,18 @@ public class DimensionIdentifier {
}
continue;
}
// Check if a standalone engineering symbol (\u03a6, \u00d8, etc.) sits
// immediately to the LEFT of this number group. On Windows, PDFBox
// often puts the symbol in a separate text run (separate writeString
// call) because the garbled \u00a1\u00a4 chars come from a different
// font than the number. After correction the symbol is a standalone
// TextElement / TextGroup that TextGrouper may not merge because the
// Y delta or X gap exceeds its thresholds. We handle it here instead.
String leadingSymbol = findLeadingSymbolGroup(g, groups, i, processed);
// 区域模式下使用完整文本含描述toleranceOnly 模式只用尺寸值
String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : dim;
String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : leadingSymbol + dim;
results.add(new DimensionResult(dimText, nearbyTol, "dimension", g));
processed.add(i);
continue;
@ -188,12 +231,114 @@ public class DimensionIdentifier {
return results;
}
/**
 * Standalone glyphs accepted as a leading diameter symbol: Greek phi (upper and
 * lower case), O with stroke (upper and lower case), the empty-set sign and the
 * diameter sign U+2300.
 */
private static final Set<String> LEADING_DIAMETER_SYMBOLS = Set.of(
        "\u03A6", "\u03C6",
        "\u00D8", "\u00F8",
        "\u2205",
        "\u2300");

/**
 * Smallest accepted value of (dimension seqNum - symbol seqNum). Pairs whose
 * difference is below this are rejected as "same layer" or "neighbouring
 * annotation" false positives.
 * NOTE(review): the value comes from observed AutoCAD exports where symbols are
 * rendered in an early pass and numbers much later; it is an empirical
 * threshold and should be re-validated against other PDF producers.
 */
private static final int MIN_SYMBOL_TO_DIM_SEQ_GAP = 180;

/**
 * Looks for a standalone diameter-symbol group that sits next to
 * {@code dimGroup} on the same page and returns its text so the caller can
 * prepend it to the dimension value.
 *
 * <p>The symbol and the number frequently arrive as separate text runs (the
 * symbol comes from a different font than the digits), and the grouping stage
 * may not merge them, so the pairing is done here.
 *
 * <p>Matching rules, all of which must hold for a candidate group:
 * <ul>
 *   <li>it is not {@code dimIndex} itself and is not yet in {@code processed};</li>
 *   <li>it is on the same page and its trimmed text is exactly one accepted symbol;</li>
 *   <li>when both sequence numbers are known, the dimension was emitted at least
 *       {@link #MIN_SYMBOL_TO_DIM_SEQ_GAP} runs after the symbol;</li>
 *   <li>{@code gap = dimGroup.x - symbol.x} lies in {@code [-10, maxGap)}, where
 *       {@code maxGap = max(5 * fontSize, 70)};</li>
 *   <li>the absolute Y distance is below {@code max(3 * fontSize, 40)}.</li>
 * </ul>
 * The symbol's LEFT edge is used on purpose: its reported width is derived from
 * the original garbled characters and can be wider than the visible glyph, so
 * a right-edge gap may come out negative even when the symbol visually
 * precedes the number.
 *
 * <p>Among the candidates the one with the smallest signed gap wins.
 * NOTE(review): because the comparison is signed, a symbol starting up to 10
 * units to the right of the number beats one starting to its left - confirm
 * that this preference is intended.
 *
 * <p>Side effect: the index of the chosen symbol group is added to
 * {@code processed} so it is not emitted again as a dimension of its own.
 *
 * @param dimGroup  the numeric group that may need a symbol prefix
 * @param allGroups every text group being analysed
 * @param dimIndex  index of {@code dimGroup} inside {@code allGroups}
 * @param processed indices already consumed; updated when a symbol is matched
 * @return the matched symbol text, or an empty string when none qualifies
 */
private String findLeadingSymbolGroup(TextGroup dimGroup, List<TextGroup> allGroups,
                                      int dimIndex, Set<Integer> processed) {
    float maxGapFromLeft = Math.max(dimGroup.getFontSize() * 5.0f, 70f);
    float maxYDelta = Math.max(dimGroup.getFontSize() * 3.0f, 40f);

    String best = null;
    float bestDist = Float.MAX_VALUE;
    int bestIdx = -1;

    for (int j = 0; j < allGroups.size(); j++) {
        if (j == dimIndex || processed.contains(j)) continue;

        TextGroup other = allGroups.get(j);
        if (other.getPageNum() != dimGroup.getPageNum()) continue;

        String t = other.getText().trim();
        if (!LEADING_DIAMETER_SYMBOLS.contains(t)) continue;

        // Rendering-order filter: skip symbols that were emitted too close to
        // (or after) the number. Unknown sequence numbers (-1) bypass the filter.
        int phiSeq = minSeqNum(other);
        int dimSeq = minSeqNum(dimGroup);
        if (phiSeq >= 0 && dimSeq >= 0 && (dimSeq - phiSeq) < MIN_SYMBOL_TO_DIM_SEQ_GAP) {
            continue;
        }

        float gapFromSymLeft = dimGroup.getX() - other.getX();
        float dy = Math.abs(other.getY() - dimGroup.getY());
        boolean withinWindow = gapFromSymLeft >= -10f
                && gapFromSymLeft < maxGapFromLeft
                && dy < maxYDelta;
        if (withinWindow && gapFromSymLeft < bestDist) {
            bestDist = gapFromSymLeft;
            best = t;
            bestIdx = j;
        }
    }

    if (best == null) {
        return "";
    }

    TextGroup bestGroup = allGroups.get(bestIdx);
    int bPhiSeq = minSeqNum(bestGroup);
    int bDimSeq = minSeqNum(dimGroup);
    log.info("findLeadingSymbolGroup: MATCHED [{}] for dim [{}] gapFromSymLeft={} dy={} "
                    + "phiSeq={} dimSeq={} seqDiff={} phiFs={} dimFs={} "
                    + "phiXY=({},{}) dimXY=({},{})",
            best, dimGroup.getText().trim(),
            String.format("%.1f", bestDist),
            String.format("%.1f", Math.abs(bestGroup.getY() - dimGroup.getY())),
            bPhiSeq, bDimSeq,
            (bPhiSeq >= 0 && bDimSeq >= 0) ? Math.abs(bPhiSeq - bDimSeq) : -1,
            String.format("%.1f", bestGroup.getFontSize()),
            String.format("%.1f", dimGroup.getFontSize()),
            String.format("%.1f", bestGroup.getX()),
            String.format("%.1f", bestGroup.getY()),
            String.format("%.1f", dimGroup.getX()),
            String.format("%.1f", dimGroup.getY()));
    processed.add(bestIdx);
    return best;
}

/** Returns the lowest seqNum among the group's elements, or -1 when the group has none. */
private static int minSeqNum(TextGroup group) {
    return group.getElements().stream()
            .mapToInt(TextElement::getSeqNum)
            .min()
            .orElse(-1);
}
private boolean hasNearbyPhiSymbol(TextGroup dimGroup, List<TextGroup> allGroups) {
float searchDist = dimGroup.getFontSize() * 3.0f;
for (TextGroup other : allGroups) {
if (other.getPageNum() != dimGroup.getPageNum()) continue;
String t = other.getText().trim();
if (!t.equals("¡¤") && !t.equals("Φ") && !t.equals("φ") && !t.equals("Ø") && !t.equals(""))
if (!t.equals("\u00A1\u00A4") && !t.equals("\u03A6") && !t.equals("\u03C6") && !t.equals("\u00D8") && !t.equals("\u2205"))
continue;
float dx = Math.abs(other.getX() - dimGroup.getX());
float dy = Math.abs(other.getY() - dimGroup.getY());
@ -202,6 +347,43 @@ public class DimensionIdentifier {
return false;
}
/**
 * Fit tolerance code appearing as its own whitespace-delimited token inside a
 * longer text: one letter followed by 1-2 digits. Accepted letters are
 * uppercase A-H, J-N, P-Z (no I, no O) and lowercase a-h, j-z (no i).
 */
private static final Pattern PAT_FIT_CODE = Pattern.compile(
        "(?:^|\\s)([A-Ha-hJ-Nj-zP-Z])(\\d{1,2})(?:\\s|$)");

/** Same letter/digit rule as {@link #PAT_FIT_CODE}, for text that is nothing but the code. */
private static final Pattern PAT_FIT_CODE_ONLY = Pattern.compile(
        "[A-Ha-hJ-Nj-zP-Z]\\d{1,2}");

/**
 * Heuristic: decides whether a dimension should be treated as a diameter
 * because a fit tolerance code (e.g. "H7", "m6") appears in its context.
 *
 * <p>Returns {@code true} when either
 * <ol>
 *   <li>{@code normalizedText} itself contains a fit code as a separate token, or</li>
 *   <li>another group on the same page, within 100 units in X and 50 units in Y
 *       of {@code dimGroup}'s origin, consists solely of a fit code.</li>
 * </ol>
 *
 * <p>NOTE(review): this is an inference, not a certainty. The "nearby" test
 * accepts any letter-plus-digits token, so labels such as "R5", "C2" or "M6"
 * (forms that PAT_PLAIN_DIM / PAT_THREAD in this class read as radius, chamfer
 * or thread callouts) also count as fit codes here. Confirm whether those
 * should be excluded, as the fit-tolerance branch of identifyDimensions does
 * for codes starting with uppercase 'R'.
 *
 * @param dimGroup       the dimension group being classified
 * @param allGroups      every text group being analysed
 * @param normalizedText the dimension group's text as used for pattern matching
 * @return {@code true} if a fit code was found in the text or close by
 */
private boolean inferDiameterFromContext(TextGroup dimGroup, List<TextGroup> allGroups, String normalizedText) {
    // 1) The dimension text itself carries a fit code.
    if (PAT_FIT_CODE.matcher(normalizedText).find()) {
        return true;
    }

    // 2) A neighbouring group is a bare fit code.
    final float searchX = 100f;
    final float searchY = 50f;
    for (TextGroup other : allGroups) {
        if (other == dimGroup) continue;
        if (other.getPageNum() != dimGroup.getPageNum()) continue;

        float dx = Math.abs(other.getX() - dimGroup.getX());
        float dy = Math.abs(other.getY() - dimGroup.getY());
        if (dx > searchX || dy > searchY) continue;

        // A full match already implies a length of 2 or 3 characters.
        if (PAT_FIT_CODE_ONLY.matcher(other.getText().trim()).matches()) {
            return true;
        }
    }
    return false;
}
private String findNearbyTolerance(TextGroup dimGroup, List<TextGroup> allGroups,
int dimIndex, Set<Integer> usedAsTolerance) {
float effWidth = dimGroup.getWidth() > 0 ? dimGroup.getWidth()
@ -258,7 +440,7 @@ public class DimensionIdentifier {
if (TitleBlockFilter.isGdtTolerance(otherText) && !bareSmallDecimal) continue;
if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("±") || bareSmallDecimal) {
if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("\u00B1") || bareSmallDecimal) {
Matcher cm = PAT_COMPOUND_TOL.matcher(otherText);
if (cm.matches()) {
@ -286,7 +468,7 @@ public class DimensionIdentifier {
}
if (bareSmallDecimal) {
tolParts.add("±" + otherText);
tolParts.add("\u00B1" + otherText);
tolIndices.add(i);
continue;
}
@ -296,8 +478,8 @@ public class DimensionIdentifier {
if (!tolParts.isEmpty()) {
usedAsTolerance.addAll(tolIndices);
tolParts.sort((a, b) -> {
boolean aPos = a.startsWith("+") || a.startsWith("±");
boolean bPos = b.startsWith("+") || b.startsWith("±");
boolean aPos = a.startsWith("+") || a.startsWith("\u00B1");
boolean bPos = b.startsWith("+") || b.startsWith("\u00B1");
return Boolean.compare(bPos, aPos);
});
return String.join(" / ", tolParts);

View File

@ -1,23 +1,37 @@
package extraction;
import lombok.Getter;
import model.TextElement;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class PositionedTextStripper extends PDFTextStripper {
@Getter
private static final Logger log = LoggerFactory.getLogger(PositionedTextStripper.class);
private final List<TextElement> elements = new ArrayList<>();
private int seqCounter = 0;
private int currentPage = 0;
private float currentPageWidth = 0;
private float currentPageHeight = 0;
public PositionedTextStripper() throws IOException {
super();
}
// @Override
// protected void processTextPosition(TextPosition text) {
// log.info("processTextPosition: {},unicode={},codes={},font={},embedded={},damaged={}"
// , text, text.getUnicode(), text.getCharacterCodes(), text.getFont().getName(), text.getFont().isEmbedded(), text.getFont().isDamaged());
// }
@Override
protected void startPage(PDPage page) throws IOException {
currentPage++;
@ -33,12 +47,38 @@ public class PositionedTextStripper extends PDFTextStripper {
super.writeString(text, textPositions);
return;
}
// log.info("writeString: {}", text);
// Rebuild text from individual TextPositions and apply garbled-pattern corrections.
StringBuilder sb = new StringBuilder();
for (TextPosition tp : textPositions) {
String u = tp.getUnicode();
if (u != null) {
sb.append(u);
}
}
String correctedText = correctGarbledText(sb.toString());
// Log original vs corrected text at INFO level for diagnostics
// log.info("writeString: raw=[{}] corrected=[{}]", text, correctedText);
// Debug: log each TextPosition's unicode with code points AND font name
if (log.isDebugEnabled()) {
for (TextPosition tp : textPositions) {
String u = tp.getUnicode();
if (u != null && !u.isEmpty() && u.charAt(0) > 0x7F) {
String fn = tp.getFont() != null ? tp.getFont().getName() : "null";
log.debug("TextPos: unicode={} codePoints={} fontSize={} font={}",
u, toCodePoints(u), String.format("%.1f", tp.getFontSizeInPt()), fn);
}
}
}
TextPosition first = textPositions.get(0);
TextPosition last = textPositions.get(textPositions.size() - 1);
TextElement elem = new TextElement();
elem.setText(text.trim());
elem.setText(correctedText.trim());
elem.setPageNum(currentPage);
elem.setX(first.getX());
elem.setY(first.getY());
@ -49,10 +89,65 @@ public class PositionedTextStripper extends PDFTextStripper {
elem.setPageHeight(currentPageHeight);
if (!elem.getText().isEmpty()) {
elem.setSeqNum(seqCounter++);
elements.add(elem);
}
super.writeString(text, textPositions);
}
/**
 * Ordered substitution rules applied by {@link #correctGarbledText(String)}.
 * Each row is {garbled sequence, replacement}. Order matters: two-character
 * and three-character sequences come first, then the single-character
 * fallbacks, and finally removal of the Unicode replacement character.
 */
private static final String[][] GARBLED_REPLACEMENTS = {
        // Pairs that arrive inside one writeString() call.
        {"\u00a1\u00a4", "\u03a6"},   // U+00A1 U+00A4 -> Greek capital phi (diameter)
        {"\u00a1\u00e3", "\u00b0"},   // U+00A1 U+00E3 -> degree sign
        {"\u00a1\u00c0", "\u00b1"},   // U+00A1 U+00C0 -> plus-minus sign
        {"\u00a6\u00b5", "\u03a6"},   // U+00A6 U+00B5 -> Greek capital phi (variant)
        // NOTE(review): in GB 2312 the bytes A1 C1 encode the multiplication
        // sign (A1 C0 is plus-minus); confirm this "variant" mapping.
        {"\u00a1\u00c1", "\u00b1"},   // U+00A1 U+00C1 -> plus-minus sign (variant)
        {"\uffc3n\uffc3", "\u03a6"},  // Mac garbled phi
        {"\uffc3$\uffc3", "\u03a6"},  // Mac garbled phi
        {"\ufffdn\ufffd", "\u00d8"},  // Mac garbled O with stroke
        // Fallbacks for a pair that was split across writeString() calls.
        {"\u00a1", "\u03a6"},         // lone U+00A1 -> Greek capital phi
        {"\u00a4", ""},               // lone U+00A4 -> dropped
        // Whatever replacement characters are left.
        {"\ufffd", ""},
};

/**
 * Repairs text runs in which engineering symbols were decoded into garbled
 * characters.
 * <p>
 * The rules in {@link #GARBLED_REPLACEMENTS} are applied in order as literal
 * substring replacements. Multi-character sequences are handled before the
 * single-character fallbacks so that an intact pair is never half-consumed.
 * <p>
 * NOTE(review): the lone U+00A1 fallback always yields phi, so a degree or
 * plus-minus pair that was split across runs would also become phi.
 * <p>
 * When the result differs from the input, both are logged at INFO level as
 * code-point lists.
 *
 * @param text raw text rebuilt from the run's TextPositions; may be null or empty
 * @return the corrected text, or {@code text} itself when it is null or empty
 */
private String correctGarbledText(String text) {
    if (text == null || text.isEmpty()) {
        return text;
    }

    String fixed = text;
    for (String[] rule : GARBLED_REPLACEMENTS) {
        fixed = fixed.replace(rule[0], rule[1]);
    }

    boolean changed = !fixed.equals(text);
    if (changed) {
        log.info("correctGarbledText: [{}] -> [{}]", toCodePoints(text), toCodePoints(fixed));
    }
    return fixed;
}
/**
 * Renders a string as a bracketed, space-separated list of its Unicode code
 * points, e.g. {@code "[U+0041 U+03A6]"}. Supplementary characters are shown
 * as one code point; an unpaired surrogate is shown as its own value.
 *
 * @param s the string to render; may be null
 * @return the code-point list, {@code "[]"} for an empty string, or the
 *         literal {@code "null"} when {@code s} is null
 */
private static String toCodePoints(String s) {
    if (s == null) return "null";
    StringBuilder sb = new StringBuilder("[");
    // Single forward walk: advance by the char width of each code point
    // instead of re-scanning from index 0 for every position.
    int i = 0;
    while (i < s.length()) {
        int cp = s.codePointAt(i);
        if (i > 0) sb.append(" ");
        sb.append(String.format("U+%04X", cp));
        i += Character.charCount(cp);
    }
    sb.append("]");
    return sb.toString();
}
/**
 * Returns the text elements collected by this stripper.
 *
 * @return the stripper's own list instance (not a copy)
 */
public List<TextElement> getElements() {
    return this.elements;
}
}

View File

@ -3,13 +3,22 @@ package extraction;
import model.TextElement;
import model.TextGroup;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.*;
public class TextGrouper {
/**
 * Short engineering symbols that should merge with adjacent elements regardless of fontSize ratio.
 * Every entry is a single character; code points are listed next to each literal.
 */
private static final Set<String> ENGINEERING_SYMBOLS = Set.of(
"\u03A6", "\u03C6", // U+03A6 / U+03C6: Greek capital / small phi - diameter
"\u00D8", "\u00F8", // U+00D8 / U+00F8: O with stroke, upper / lower case - diameter
"\u2205", // U+2205: empty set - sometimes used for diameter
"\u2300", // U+2300: diameter sign
"\u25A1", "\u25FB", "\u25AD", // U+25A1 / U+25FB / U+25AD: square and rectangle glyphs - square cross-section symbols
"\u00B1", // U+00B1: plus-minus sign
"\u00B0", // U+00B0: degree sign
"\u00D7" // U+00D7: multiplication sign
);
public List<TextGroup> groupTextElements(List<TextElement> elements) {
if (elements.isEmpty()) return Collections.emptyList();
@ -32,7 +41,14 @@ public class TextGrouper {
float maxFs = Math.max(current.getFontSize(), elem.getFontSize());
float minFs = Math.min(current.getFontSize(), elem.getFontSize());
if (maxFs > 0 && minFs / maxFs < 0.7f) {
// Allow engineering symbols (\u03a6, \u00d8, \u00b1, etc.) to merge
// with adjacent elements regardless of fontSize ratio. These symbols
// are often rendered at a much larger fontSize than the dimension
// numbers they belong to, but they must be in the same TextGroup for
// the DimensionIdentifier regex patterns to match (e.g. "\u03a650\u00b10.1").
boolean isSymbol = isShortEngineeringSymbol(current.getText())
|| isShortEngineeringSymbol(elem.getText());
if (maxFs > 0 && minFs / maxFs < 0.7f && !isSymbol) {
merge = false;
} else {
boolean curEndsWithDigit = !current.getText().isEmpty()
@ -83,4 +99,14 @@ public class TextGrouper {
return groups;
}
/**
 * Tells whether {@code text}, once trimmed, is one of the entries of
 * {@code ENGINEERING_SYMBOLS} and at most two characters long. Such elements
 * are exempt from the font-size-ratio check when deciding whether to merge.
 *
 * @param text element text; may be null
 * @return {@code true} only for a non-null, non-empty text whose trimmed form
 *         is a known engineering symbol
 */
private boolean isShortEngineeringSymbol(String text) {
    if (text == null || text.isEmpty()) {
        return false;
    }
    String candidate = text.trim();
    if (candidate.length() > 2) {
        return false;
    }
    return ENGINEERING_SYMBOLS.contains(candidate);
}
}

View File

@ -1,16 +1,48 @@
package extraction;
import java.text.Normalizer;
/**
 * Repairs symbol sequences that were mis-decoded during PDF text extraction
 * and brings the text into Unicode NFC form.
 */
public class TextNormalizer {

    /**
     * Replaces known garbled character sequences with the engineering symbol
     * they stand for, strips leftover replacement characters, and applies NFC
     * normalization.
     *
     * <p>Replacements are literal and applied in the order written below;
     * multi-character sequences containing U+FFFD are handled before the final
     * rule that deletes every remaining U+FFFD.
     *
     * @param text the text to clean; may be null or empty
     * @return the cleaned text, or {@code text} itself when it is null or empty
     */
    public static String normalizeText(String text) {
        if (text == null || text.isEmpty()) return text;

        String result = text
                // --- Two-character sequences left by a single-byte decoding of double-byte symbol codes ---
                .replace("\u00a1\u00a4", "\u03a6")   // U+00A1 U+00A4 -> Greek capital phi
                .replace("\u00a1\u00e3", "\u00b0")   // U+00A1 U+00E3 -> degree sign
                .replace("\u00a1\u00c0", "\u00b1")   // U+00A1 U+00C0 -> plus-minus sign
                .replace("\u00a6\u00b5", "\u03a6")   // U+00A6 U+00B5 -> Greek capital phi
                // --- Three-character sequences seen in Mac-produced text ---
                .replace("\uffc3n\uffc3", "\u03a6")  // -> Greek capital phi
                .replace("\uffc3$\uffc3", "\u03a6")  // -> Greek capital phi
                .replace("\ufffdn\ufffd", "\u00d8")  // -> O with stroke

                // --- A stray U+0080 control character directly after a symbol ---
                .replace("\u03a6\u0080", "\u03a6")
                .replace("\u00d8\u0080", "\u00d8")
                .replace("\u00b1\u0080", "\u00b1")

                // --- C1 control characters mapped to the glyph at the same Windows-1252 byte ---
                .replace("\u0086", "\u2020")         // 0x86 -> dagger
                .replace("\u0087", "\u2021")         // 0x87 -> double dagger
                .replace("\u0089", "\u2030")         // 0x89 -> per mille sign

                // --- Further garbled pairs for engineering symbols ---
                .replace("\u00c6\u00f8", "\u00d8")   // U+00C6 U+00F8 -> O with stroke
                // NOTE(review): in GB 2312 the bytes A1 C1 encode the
                // multiplication sign (A1 C0 is plus-minus); confirm that
                // mapping this pair to plus-minus is intended.
                .replace("\u00a1\u00c1", "\u00b1")   // U+00A1 U+00C1 -> plus-minus sign (variant)

                // --- Drop whatever replacement characters remain ---
                .replace("\ufffd", "");

        // NFC: canonical decomposition followed by recomposition, so that a
        // character with several encodings (e.g. "A" + combining ring vs. the
        // precomposed letter) ends up in one form.
        return Normalizer.normalize(result, Normalizer.Form.NFC);
    }
}

View File

@ -4,10 +4,13 @@ import lombok.Data;
/**
 * One text run captured from a PDF page together with its position and size.
 * Accessors, equals/hashCode and toString are generated by Lombok's
 * {@code @Data}; field order is kept stable because the generated methods
 * follow declaration order.
 */
@Data
public class TextElement {

    /** Text content of the run. */
    private String text;

    /** Page the run was found on. */
    private int pageNum;

    /** X coordinate of the run. */
    private float x;

    /** Y coordinate of the run. */
    private float y;

    /** Width of the run. */
    private float width;

    /** Height of the run. */
    private float height;

    /** Font size of the run. */
    private float fontSize;

    /** Width of the page the run belongs to. */
    private float pageWidth;

    /** Height of the page the run belongs to. */
    private float pageHeight;

    /** Sequence number assigned to the run when it was collected. */
    private int seqNum;
}