feat(pdf-extract): 实现PDF尺寸和公差的提取识别功能

- 新增DimensionIdentifier用于识别PDF中的尺寸和多种公差格式
- 支持对称公差、非对称公差、配合公差和螺纹标注的正则匹配
- 实现基于TextGroup的文本预处理和位置判断,提高识别准确度
- 创建DimensionResult作为尺寸识别结果的封装实体
- 增加PdfExtractionService服务实现PDF解析、文本分组和尺寸提取流程
- 新增配置类PdfExtractConfig,提供文本分组和尺寸识别组件的Spring Bean
- 增加nflg-qms-pdf-extract模块及相关依赖管理,完成PDF尺寸提取的整体集成
This commit is contained in:
曹鹏飞 2026-05-06 15:49:28 +08:00
parent fe9db7ec86
commit e8142d0480
27 changed files with 1187 additions and 23 deletions

View File

@ -23,6 +23,10 @@
<groupId>com.nflg</groupId>
<artifactId>nflg-wms-starter</artifactId>
</dependency>
<dependency>
<groupId>com.nflg</groupId>
<artifactId>nflg-qms-pdf-extract</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>

View File

@ -0,0 +1,33 @@
package com.nflg.qms.admin.config;
import extraction.DimensionIdentifier;
import extraction.TextGrouper;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import service.PdfExtractionService;
import service.RegionFilterService;
/**
 * Spring configuration that exposes the PDF dimension-extraction components as beans.
 * <p>
 * The components themselves are plain classes without Spring annotations, so they are
 * instantiated here explicitly.
 */
@Configuration
public class PdfExtractConfig {

    /**
     * @return the component that merges individual PDF text elements into text groups
     */
    @Bean
    public TextGrouper textGrouper() {
        return new TextGrouper();
    }

    /**
     * @return the component that recognises dimensions and tolerances in text groups
     */
    @Bean
    public DimensionIdentifier dimensionIdentifier() {
        return new DimensionIdentifier();
    }

    /**
     * @return the service that filters and merges extraction results by page region
     */
    @Bean
    public RegionFilterService regionFilterService() {
        return new RegionFilterService();
    }

    /**
     * Wires the extraction service from the two collaborating beans declared in this class.
     *
     * @param textGrouper         bean used to group text elements
     * @param dimensionIdentifier bean used to identify dimensions
     * @return the PDF extraction service
     */
    @Bean
    public PdfExtractionService pdfExtractionService(TextGrouper textGrouper, DimensionIdentifier dimensionIdentifier) {
        PdfExtractionService extractionService = new PdfExtractionService(textGrouper, dimensionIdentifier);
        return extractionService;
    }
}

View File

@ -1,18 +1,24 @@
package com.nflg.qms.admin.controller;
import com.nflg.qms.admin.pojo.qo.QmsInspectionStandardSaveQO;
import com.nflg.qms.admin.pojo.qo.QmsPdfExtractRegionQO;
import com.nflg.qms.admin.service.QmsInspectionStandardControllerService;
import com.nflg.wms.common.pojo.ApiResult;
import com.nflg.wms.common.pojo.PageData;
import com.nflg.wms.common.pojo.qo.*;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardDetailVO;
import com.nflg.qms.admin.pojo.vo.QmsInspectionStandardDetailVO;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardVO;
import com.nflg.wms.starter.BaseController;
import jakarta.annotation.Resource;
import jakarta.validation.Valid;
import jakarta.validation.constraints.NotNull;
import model.DimensionResult;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.*;
import java.io.IOException;
import java.util.List;
/**
* 检验标准
*/
@ -92,4 +98,12 @@ public class QmsInspectionStandardController extends BaseController {
inspectionStandardControllerService.delete(request.getIds());
return ApiResult.success();
}
/**
 * Extracts the dimension data located inside a region of a PDF file.
 *
 * @param request validated request body describing the file and the region
 * @return the extraction results produced by the service layer, wrapped in an {@code ApiResult}
 * @throws IOException propagated unchanged from the service layer
 */
@PostMapping("/pdf/extract-region")
public ApiResult<List<DimensionResult>> pdfExtractRegion(@Valid @RequestBody QmsPdfExtractRegionQO request) throws IOException {
    List<DimensionResult> regionResults = inspectionStandardControllerService.pdfExtractRegion(request);
    return ApiResult.success(regionResults);
}
}

View File

@ -2,7 +2,7 @@ package com.nflg.qms.admin.controller;
import com.nflg.qms.admin.service.QmsInspectionStandardItemControllerService;
import com.nflg.wms.common.pojo.ApiResult;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardItemContentVO;
import com.nflg.qms.admin.pojo.vo.QmsInspectionStandardItemContentVO;
import com.nflg.wms.starter.BaseController;
import jakarta.annotation.Resource;
import jakarta.validation.constraints.NotNull;

View File

@ -1,9 +1,10 @@
package com.nflg.wms.common.pojo.qo;
package com.nflg.qms.admin.pojo.qo;
import jakarta.validation.Valid;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.NotNull;
import lombok.Data;
import model.TextPosition;
import java.util.List;
@ -167,7 +168,7 @@ public class QmsInspectionStandardSaveQO {
/**
* PDF信息
*/
private String pdfInfo;
private TextPosition pdfInfo;
/**
* 判定类型0直接判定1测量值

View File

@ -0,0 +1,51 @@
package com.nflg.qms.admin.pojo.qo;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.NotNull;
import lombok.Data;
/**
 * Request body for extracting dimension data from a rectangular region of one PDF page.
 * <p>
 * The numeric fields are wrapper types on purpose: {@code @NotNull} can never fail on a
 * primitive, because a JSON property that is missing simply deserialises to {@code 0}.
 * With wrappers a missing value stays {@code null} and is rejected by validation.
 * Call sites that pass the getters to {@code int}/{@code double} parameters keep compiling
 * through unboxing.
 */
@Data
public class QmsPdfExtractRegionQO {

    /**
     * File URL.
     */
    @NotBlank
    private String url;

    /**
     * Page number; must be at least 1.
     */
    @Min(1)
    @NotNull
    private Integer pageNum;

    /**
     * Start position on the x axis; must not be negative.
     */
    @Min(0)
    @NotNull
    private Float x;

    /**
     * Start position on the y axis; must not be negative.
     */
    @Min(0)
    @NotNull
    private Float y;

    /**
     * Region width; must not be negative.
     */
    @Min(0)
    @NotNull
    private Float width;

    /**
     * Region height; must not be negative.
     */
    @Min(0)
    @NotNull
    private Float height;
}

View File

@ -1,6 +1,7 @@
package com.nflg.wms.common.pojo.vo;
package com.nflg.qms.admin.pojo.vo;
import lombok.Data;
import model.TextPosition;
import java.time.LocalDateTime;
@ -38,7 +39,7 @@ public class QmsInspectionStandardItemContentVO {
/**
* PDF信息
*/
private String pdfInfo;
private TextPosition pdfInfo;
/**
* 判定类型0直接判定1测量值

View File

@ -1,4 +1,4 @@
package com.nflg.wms.common.pojo.vo;
package com.nflg.qms.admin.pojo.vo;
import lombok.Data;

View File

@ -1,18 +1,22 @@
package com.nflg.qms.admin.service;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.convert.Convert;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.nflg.qms.admin.pojo.qo.QmsPdfExtractRegionQO;
import com.nflg.wms.common.exception.NflgException;
import com.nflg.wms.common.pojo.PageData;
import com.nflg.wms.common.pojo.dto.QmsInspectionStandardDetailDTO;
import com.nflg.wms.common.pojo.qo.QmsInspectionStandardAddQO;
import com.nflg.wms.common.pojo.qo.QmsInspectionStandardEditQO;
import com.nflg.wms.common.pojo.qo.QmsInspectionStandardSaveQO;
import com.nflg.qms.admin.pojo.qo.QmsInspectionStandardSaveQO;
import com.nflg.wms.common.pojo.qo.QmsInspectionStandardSearchQO;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardDetailVO;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardItemContentVO;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardItemVO;
import com.nflg.qms.admin.pojo.vo.QmsInspectionStandardDetailVO;
import com.nflg.qms.admin.pojo.vo.QmsInspectionStandardItemContentVO;
import com.nflg.qms.admin.pojo.vo.QmsInspectionStandardItemVO;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardVO;
import com.nflg.wms.common.util.UserUtil;
import com.nflg.wms.common.util.VUtil;
@ -21,9 +25,14 @@ import com.nflg.wms.repository.mapper.QmsInspectionStandardMapper;
import com.nflg.wms.repository.service.*;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import model.DimensionResult;
import model.TextPosition;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import service.PdfExtractionService;
import service.RegionFilterService;
import java.io.IOException;
import java.time.LocalDateTime;
import java.util.*;
import java.util.stream.Collectors;
@ -59,6 +68,12 @@ public class QmsInspectionStandardControllerService {
@Resource
private IQmsAqlPriorityValueService aqlPriorityValueService;
@Resource
private PdfExtractionService pdfExtractionService;
@Resource
private RegionFilterService regionFilterService;
/**
* 分页查询检验标准
*/
@ -153,10 +168,10 @@ public class QmsInspectionStandardControllerService {
}
// 2. 查询检验标准详情关联物料等信息
QmsInspectionStandardDetailVO detail = inspectionStandardMapper.getDetailById(id);
if (detail == null) {
detail = new QmsInspectionStandardDetailVO();
}
QmsInspectionStandardDetailDTO detailDTO = inspectionStandardMapper.getDetailById(id);
QmsInspectionStandardDetailVO detail = Objects.isNull(detailDTO)
? new QmsInspectionStandardDetailVO()
: Convert.convert(QmsInspectionStandardDetailVO.class, detailDTO);
// 填充基础字段
detail.setId(standard.getId());
@ -289,7 +304,9 @@ public class QmsInspectionStandardControllerService {
vo.setName(content.getName());
vo.setTestStandard(content.getTestStandard());
vo.setLegend(content.getLegend());
vo.setPdfInfo(content.getPdfInfo());
if (StrUtil.isNotBlank(content.getPdfInfo())) {
vo.setPdfInfo(JSONUtil.toBean(content.getPdfInfo(), TextPosition.class));
}
vo.setJudgmentType(content.getJudgmentType());
vo.setCreateUserName(content.getCreateUserName());
vo.setCreateTime(content.getCreateTime());
@ -665,7 +682,7 @@ public class QmsInspectionStandardControllerService {
content.setLegend(qo.getLegend());
}
if (qo.getPdfInfo() != null) {
content.setPdfInfo(qo.getPdfInfo());
content.setPdfInfo(JSONUtil.toJsonStr(qo.getPdfInfo()));
}
content.setJudgmentType(qo.getJudgmentType());
@ -680,4 +697,17 @@ public class QmsInspectionStandardControllerService {
content.setCreateTime(now);
}
}
/**
 * Extracts every dimension from the PDF referenced by the request, keeps those that fall
 * on the requested page and intersect the requested rectangle, and merges what is left.
 *
 * @param request file URL, page number and rectangle (x, y, width, height)
 * @return the merged region results as returned by {@code RegionFilterService#mergeRegionResults}
 * @throws IOException if the extraction service fails to read or parse the PDF
 */
public List<DimensionResult> pdfExtractRegion(QmsPdfExtractRegionQO request) throws IOException {
    String pdfUrl = request.getUrl();
    List<DimensionResult> everything = pdfExtractionService.extractAllDimensionsForRegion(pdfUrl);
    List<DimensionResult> insideRegion = regionFilterService.filterByRegion(
            everything,
            request.getPageNum(),
            request.getX(),
            request.getY(),
            request.getWidth(),
            request.getHeight());
    return regionFilterService.mergeRegionResults(insideRegion);
}
}

View File

@ -1,13 +1,16 @@
package com.nflg.qms.admin.service;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;
import com.nflg.wms.common.exception.NflgException;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardItemContentVO;
import com.nflg.qms.admin.pojo.vo.QmsInspectionStandardItemContentVO;
import com.nflg.wms.repository.entity.QmsInspectionStandardItem;
import com.nflg.wms.repository.entity.QmsInspectionStandardItemContent;
import com.nflg.wms.repository.service.IQmsInspectionStandardItemContentService;
import com.nflg.wms.repository.service.IQmsInspectionStandardItemService;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import model.TextPosition;
import org.springframework.stereotype.Component;
import java.util.List;
@ -59,7 +62,9 @@ public class QmsInspectionStandardItemControllerService {
vo.setName(content.getName());
vo.setTestStandard(content.getTestStandard());
vo.setLegend(content.getLegend());
vo.setPdfInfo(content.getPdfInfo());
if (StrUtil.isNotBlank(content.getPdfInfo())) {
vo.setPdfInfo(JSONUtil.toBean(content.getPdfInfo(), TextPosition.class));
}
vo.setJudgmentType(content.getJudgmentType());
vo.setCreateUserName(content.getCreateUserName());
vo.setCreateTime(content.getCreateTime());

View File

@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.nflg</groupId>
<artifactId>nflg-wms</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<artifactId>nflg-qms-pdf-extract</artifactId>
<name>模块-qms pdf数据提取</name>
<description>从pdf中提取数据</description>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<scope>provided</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.4</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>2.0.12</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,307 @@
package extraction;
import model.DimensionResult;
import model.TextGroup;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Recognises dimension callouts, and any tolerance attached to them, in grouped PDF text.
 * <p>
 * Every {@link TextGroup} is normalised, run through a set of filters and then tried against
 * a fixed sequence of regular expressions; the first expression that matches decides how the
 * group is reported. Two modes exist: {@code toleranceOnly} (strict, only dimensions that
 * carry a tolerance are reported) and the region mode ({@code !toleranceOnly}), which reports
 * practically every group, falling back to the raw text.
 */
public class DimensionIdentifier {
// Dimension followed by a symmetric tolerance: optional diameter symbol, number, "±", number.
private static final Pattern PAT_DIM_SYM_TOL = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*[±]\\s*(\\d+\\.?\\d*)");
// Dimension followed by an asymmetric tolerance whose two signed limits are separated by "/".
private static final Pattern PAT_DIM_ASYM_TOL = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([+-]\\d+\\.?\\d*)\\s*/\\s*([+-]\\d+\\.?\\d*)");
// Dimension followed by an asymmetric tolerance whose two signed limits are separated by whitespace.
private static final Pattern PAT_DIM_LIMIT_TOL = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)\\s+([+-]\\d+\\.?\\d*)");
// Dimension followed by a fit-tolerance code: one letter (I and i excluded) plus one or two digits.
private static final Pattern PAT_DIM_FIT = Pattern.compile(
"([ΦφØ∅]?\\s*\\d+\\.?\\d*)\\s*([A-HJ-Zj-z]\\d{1,2})\\b");
// Thread callouts: "M" threads with an optional pitch, and G / Rc / Rp / NPT sizes.
private static final Pattern PAT_THREAD = Pattern.compile(
"(M\\d+\\.?\\d*(?:\\s*[xX×]\\s*\\d+\\.?\\d*)?|(?:G|Rc|Rp|NPT)\\s*\\d+(?:/\\d+)?)");
// Stand-alone tolerance text: "±n" or a signed number.
private static final Pattern PAT_TOLERANCE = Pattern.compile(
"[±]\\s*\\d+\\.?\\d*|[+-]\\s*\\d+\\.?\\d*");
// Compound tolerance text: the whole string is a signed number, whitespace, then "0" or another number.
private static final Pattern PAT_COMPOUND_TOL = Pattern.compile(
"^([+-]\\d+\\.?\\d*)\\s+(0|[+-]?\\d+\\.?\\d*)$");
// Plain dimension value: diameter-, R- or C-prefixed number, a decimal, or an integer.
private static final Pattern PAT_PLAIN_DIM = Pattern.compile(
"([ΦφØ∅]\\s*\\d+\\.?\\d*|[Rr]\\d+\\.?\\d*|[Cc]\\d+\\.?\\d*|\\d+\\.\\d+|\\d+)");
/**
 * Identifies dimensions in strict mode; equivalent to {@code identifyDimensions(groups, true)}.
 *
 * @param groups text groups to analyse
 * @return the recognised dimensions
 */
public List<DimensionResult> identifyDimensions(List<TextGroup> groups) {
return identifyDimensions(groups, true);
}
/**
 * Identifies dimensions in the given text groups.
 *
 * @param groups        text groups to analyse, visited in list order
 * @param toleranceOnly {@code true} for strict mode (extra content filters, thread callouts
 *                      dropped, plain values reported only when a nearby tolerance is found);
 *                      {@code false} for region mode, where unmatched text is still reported
 * @return one result per reported group; every result has type {@code "dimension"}
 */
public List<DimensionResult> identifyDimensions(List<TextGroup> groups, boolean toleranceOnly) {
List<DimensionResult> results = new ArrayList<>();
// Indices that have already produced a result. An index is only ever added while it is being
// visited, so the contains() check at the top of the loop does not fire as the code stands.
Set<Integer> processed = new HashSet<>();
// Indices that findNearbyTolerance has attached to a dimension, so that one tolerance
// group is not attached to two dimensions.
Set<Integer> usedAsTolerance = new HashSet<>();
for (int i = 0; i < groups.size(); i++) {
if (processed.contains(i)) continue;
TextGroup g = groups.get(i);
String text = TextNormalizer.normalizeText(g.getText().trim());
// --- Basic filters, applied in both modes ---
if (text.length() > 40) continue;
if (text.isEmpty()) continue;
// Text made only of letters and whitespace cannot be a dimension.
if (text.matches("^[A-Za-z\\s]+$")) continue;
if (TitleBlockFilter.isInTitleBlockRegion(g)) continue;
if (toleranceOnly) {
// === Strict filters, toleranceOnly mode only ===
// Text that starts with a thread callout is exempt from the keyword skip list.
Matcher threadEarly = PAT_THREAD.matcher(text);
boolean isThreadText = threadEarly.find() && threadEarly.start() == 0;
if (!isThreadText && TitleBlockFilter.shouldSkipText(text)) continue;
if (TitleBlockFilter.isToleranceOnly(text)) continue;
if (TitleBlockFilter.isSurfaceRoughness(text)) continue;
if (TitleBlockFilter.isGdtTolerance(text)) continue;
// Skip text that looks like two decimal numbers run together, unless it carries "±" or "/".
if (text.matches(".*\\d+\\.\\d+\\d+\\.\\d+.*") && !text.contains("±") && !text.contains("/")) continue;
}
// === Region mode (!toleranceOnly): no content filtering, go straight to pattern matching ===
Matcher m;
// 0) Compound tolerance text: belongs to some dimension, so it is never reported on its own.
m = PAT_COMPOUND_TOL.matcher(text);
if (m.matches()) {
processed.add(i);
continue;
}
// 1) Symmetric tolerance
m = PAT_DIM_SYM_TOL.matcher(text);
if (m.find()) {
results.add(new DimensionResult(
m.group(1).trim(), "±" + m.group(2), "dimension", g));
processed.add(i);
continue;
}
// 2) Asymmetric tolerance, slash-separated
m = PAT_DIM_ASYM_TOL.matcher(text);
if (m.find()) {
results.add(new DimensionResult(
m.group(1).trim(), m.group(2) + "/" + m.group(3), "dimension", g));
processed.add(i);
continue;
}
// 3) Asymmetric tolerance, whitespace-separated
m = PAT_DIM_LIMIT_TOL.matcher(text);
if (m.find()) {
results.add(new DimensionResult(
m.group(1).trim(), m.group(2) + " " + m.group(3), "dimension", g));
processed.add(i);
continue;
}
// 4) Fit tolerance; text containing "-" or "/" is not treated as a fit code.
m = PAT_DIM_FIT.matcher(text);
if (m.find() && !text.contains("-") && !text.contains("/")) {
results.add(new DimensionResult(
m.group(1).trim(), m.group(2), "dimension", g));
processed.add(i);
continue;
}
// 5) Thread callout at the start of the text: reported in region mode, dropped in toleranceOnly mode.
m = PAT_THREAD.matcher(text);
if (m.find() && m.start() == 0) {
if (!toleranceOnly) {
results.add(new DimensionResult(text, null, "dimension", g));
processed.add(i);
}
continue;
}
// 6) Plain dimension value
m = PAT_PLAIN_DIM.matcher(text);
if (m.find()) {
String dim = m.group(1).trim();
// Number of characters the text may have beyond the matched value before it is
// no longer treated as a plain dimension.
int maxExtra = toleranceOnly ? 3 : 15;
if (text.length() > dim.length() + maxExtra) {
// Region mode fallback for over-long text: report the full text.
if (!toleranceOnly) {
results.add(new DimensionResult(text, null, "dimension", g));
processed.add(i);
}
continue;
}
// Strip the prefix symbol/letter so that only digits and dots are parsed.
String numPart = dim.replaceAll("[^\\d.]", "");
if (numPart.isEmpty()) continue;
double val;
try {
val = Double.parseDouble(numPart);
// Values outside [minVal, 9999] are discarded.
double minVal = toleranceOnly ? 0.5 : 0.001;
if (val < minVal || val > 9999) continue;
} catch (NumberFormatException e) {
continue;
}
String nearbyTol = findNearbyTolerance(g, groups, i, usedAsTolerance);
// toleranceOnly mode reports a plain value only when a tolerance was found next to it.
if (toleranceOnly && nearbyTol == null) continue;
// Region mode (!toleranceOnly): a single-character value with no tolerance and no nearby
// diameter symbol is reported with the full text when the text is longer than one
// character; otherwise the group is skipped.
if (!toleranceOnly && nearbyTol == null && dim.length() == 1 && !hasNearbyPhiSymbol(g, groups)) {
if (text.length() > 1) {
results.add(new DimensionResult(text, null, "dimension", g));
processed.add(i);
}
continue;
}
// Region mode uses the full text (including description) when it is more than three
// characters longer than the value; toleranceOnly mode always uses the bare value.
String dimText = (!toleranceOnly && text.length() > dim.length() + 3) ? text : dim;
results.add(new DimensionResult(dimText, nearbyTol, "dimension", g));
processed.add(i);
continue;
}
// 7) Region mode fallback: text matched by no pattern is reported as raw content.
if (!toleranceOnly) {
results.add(new DimensionResult(text, null, "dimension", g));
processed.add(i);
}
}
return results;
}
/**
 * Tells whether a group consisting solely of a diameter symbol lies close to the given group.
 *
 * @param dimGroup  group holding the dimension value
 * @param allGroups all groups of the document
 * @return {@code true} if a symbol-only group on the same page lies within three font sizes
 *         of {@code dimGroup} on both axes
 */
private boolean hasNearbyPhiSymbol(TextGroup dimGroup, List<TextGroup> allGroups) {
float searchDist = dimGroup.getFontSize() * 3.0f;
for (TextGroup other : allGroups) {
if (other.getPageNum() != dimGroup.getPageNum()) continue;
String t = other.getText().trim();
// NOTE(review): the last literal is an empty string here; it may be a symbol glyph that
// was lost in an encoding round-trip - confirm against the repository copy.
if (!t.equals("¡¤") && !t.equals("Φ") && !t.equals("φ") && !t.equals("Ø") && !t.equals(""))
continue;
float dx = Math.abs(other.getX() - dimGroup.getX());
float dy = Math.abs(other.getY() - dimGroup.getY());
if (dx < searchDist && dy < searchDist) return true;
}
return false;
}
/**
 * Looks for tolerance text positioned next to a dimension group.
 * <p>
 * Candidates are searched in a zone to the right of the dimension and in a zone vertically
 * offset from its start; zone sizes are multiples of the dimension's font size.
 *
 * @param dimGroup        group holding the dimension value
 * @param allGroups       all groups of the document
 * @param dimIndex        index of {@code dimGroup} in {@code allGroups}
 * @param usedAsTolerance indices already attached to a dimension; the indices used by this
 *                        call are added when a tolerance is returned
 * @return the tolerance parts joined with {@code " / "} (parts starting with "+" or "±"
 *         first), or {@code null} when nothing suitable was found
 */
private String findNearbyTolerance(TextGroup dimGroup, List<TextGroup> allGroups,
int dimIndex, Set<Integer> usedAsTolerance) {
// Fall back to an estimated width when the group reports none.
float effWidth = dimGroup.getWidth() > 0 ? dimGroup.getWidth()
: dimGroup.getFontSize() * dimGroup.getText().trim().length() * 0.5f;
float searchXRight = dimGroup.getFontSize() * 2.5f;
float searchYRight = dimGroup.getFontSize() * 1.5f;
float searchXVert = dimGroup.getFontSize() * 1.0f;
float searchYVert = dimGroup.getFontSize() * 5.0f;
List<String> tolParts = new ArrayList<>();
List<Integer> tolIndices = new ArrayList<>();
for (int i = 0; i < allGroups.size(); i++) {
if (i == dimIndex) continue;
if (usedAsTolerance.contains(i)) continue;
TextGroup other = allGroups.get(i);
if (other.getPageNum() != dimGroup.getPageNum()) continue;
float dxFromRight = other.getX() - (dimGroup.getX() + effWidth);
float absDxFromStart = Math.abs(other.getX() - dimGroup.getX());
float dy = Math.abs(other.getY() - dimGroup.getY());
// Right zone: starts slightly inside the dimension's right edge and stays on roughly the same line.
boolean rightZone = dxFromRight > -effWidth * 0.3f
&& dxFromRight < searchXRight && dy < searchYRight;
// Vertical zone: starts at a similar x but is offset in y beyond the right zone's band.
boolean vertZone = absDxFromStart < searchXVert
&& dy > searchYRight && dy < searchYVert;
if (!rightZone && !vertZone) continue;
if (rightZone && !vertZone) {
// A right-zone candidate is dropped when some other group of similar font size has this
// candidate in its own vertical zone, i.e. the candidate may belong to that group instead.
boolean hasBetterCandidate = false;
for (TextGroup cand : allGroups) {
if (cand == dimGroup || cand.getPageNum() != dimGroup.getPageNum()) continue;
if (Math.abs(cand.getFontSize() - dimGroup.getFontSize()) > 1.0f) continue;
String candText = TextNormalizer.normalizeText(cand.getText().trim());
if (TitleBlockFilter.isSurfaceRoughness(candText) || TitleBlockFilter.isToleranceOnly(candText)) continue;
float candDxFromTol = Math.abs(other.getX() - cand.getX());
float candDyFromTol = Math.abs(other.getY() - cand.getY());
if (candDxFromTol < searchXVert
&& candDyFromTol > searchYRight && candDyFromTol < searchYVert) {
hasBetterCandidate = true;
break;
}
}
if (hasBetterCandidate) continue;
}
String otherText = TextNormalizer.normalizeText(other.getText().trim());
if (TitleBlockFilter.isSurfaceRoughness(otherText)) continue;
// A bare "0.xxx" inside a search zone is kept as a tolerance even though it also
// matches the GD&T value format.
boolean bareSmallDecimal = (rightZone || vertZone) && otherText.matches("^0\\.\\d{1,3}$");
if (TitleBlockFilter.isGdtTolerance(otherText) && !bareSmallDecimal) continue;
// Only text in a smaller font, text containing "±", or a bare small decimal is considered.
if (other.getFontSize() <= dimGroup.getFontSize() * 0.9 || otherText.contains("±") || bareSmallDecimal) {
Matcher cm = PAT_COMPOUND_TOL.matcher(otherText);
if (cm.matches()) {
// A compound tolerance replaces everything collected so far and ends the search.
tolParts.clear();
tolIndices.clear();
tolParts.add(cm.group(1) + " / " + cm.group(2));
tolIndices.add(i);
break;
}
Matcher tm = PAT_TOLERANCE.matcher(otherText);
if (tm.find()) {
// A sign directly preceded by a digit is part of an expression, not a tolerance.
if (tm.start() > 0 && Character.isDigit(otherText.charAt(tm.start() - 1))) {
continue;
}
tolParts.add(otherText);
tolIndices.add(i);
continue;
}
// A lone "0" in a clearly smaller font is taken as one limit of a tolerance.
if (otherText.equals("0") && other.getFontSize() < dimGroup.getFontSize() * 0.85) {
tolParts.add("0");
tolIndices.add(i);
continue;
}
if (bareSmallDecimal) {
tolParts.add("±" + otherText);
tolIndices.add(i);
continue;
}
}
}
if (!tolParts.isEmpty()) {
usedAsTolerance.addAll(tolIndices);
// Parts starting with "+" or "±" are placed before the others.
tolParts.sort((a, b) -> {
boolean aPos = a.startsWith("+") || a.startsWith("±");
boolean bPos = b.startsWith("+") || b.startsWith("±");
return Boolean.compare(bPos, aPos);
});
return String.join(" / ", tolParts);
}
return null;
}
}

View File

@ -0,0 +1,59 @@
package extraction;
import model.TextElement;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * A {@link PDFTextStripper} that, besides producing the normal text output, records every
 * string it writes together with its position, size, font size and page dimensions.
 */
public class PositionedTextStripper extends PDFTextStripper {
    private final List<TextElement> elements = new ArrayList<>();
    private int currentPage = 0;
    private float currentPageWidth = 0;
    private float currentPageHeight = 0;

    /**
     * Captures the page number and media-box size of the page that is about to be processed.
     * <p>
     * The page number is taken from {@link #getCurrentPageNo()} instead of a private counter:
     * the stripper advances its own page number for every page of the document but only
     * calls {@code startPage} for pages it actually processes, so a counter incremented here
     * falls behind the real page number once a page without contents is skipped or a
     * start page is configured.
     *
     * @param page the page being started
     * @throws IOException if the superclass fails
     */
    @Override
    protected void startPage(PDPage page) throws IOException {
        currentPage = getCurrentPageNo();
        // NOTE(review): page size comes from the media box; confirm this is the box the
        // recorded text coordinates should be compared against.
        PDRectangle mediaBox = page.getMediaBox();
        currentPageWidth = mediaBox.getWidth();
        currentPageHeight = mediaBox.getHeight();
        super.startPage(page);
    }

    /**
     * Records one {@link TextElement} for the written string and then delegates to the
     * superclass so the regular text output is unchanged.
     * <p>
     * Position and font size are taken from the first glyph; the width spans from the first
     * glyph's x to the right edge of the last glyph; the height is at least 1. Strings that
     * are empty after trimming are not recorded.
     *
     * @param text          the string being written
     * @param textPositions the glyph positions belonging to {@code text}
     * @throws IOException if the superclass fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        if (textPositions == null || textPositions.isEmpty()) {
            super.writeString(text, textPositions);
            return;
        }
        TextPosition first = textPositions.get(0);
        TextPosition last = textPositions.get(textPositions.size() - 1);

        TextElement elem = new TextElement();
        elem.setText(text.trim());
        elem.setPageNum(currentPage);
        elem.setX(first.getX());
        elem.setY(first.getY());
        elem.setWidth(Math.abs((last.getX() + last.getWidth()) - first.getX()));
        elem.setHeight(Math.max(1, Math.abs(first.getHeight())));
        elem.setFontSize(first.getFontSizeInPt());
        elem.setPageWidth(currentPageWidth);
        elem.setPageHeight(currentPageHeight);
        if (!elem.getText().isEmpty()) {
            elements.add(elem);
        }
        super.writeString(text, textPositions);
    }

    /**
     * @return the live list of elements recorded so far (not a copy)
     */
    public List<TextElement> getElements() {
        return elements;
    }
}

View File

@ -0,0 +1,86 @@
package extraction;
import model.TextElement;
import model.TextGroup;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
 * Merges neighbouring {@link TextElement}s that sit on the same line into {@link TextGroup}s.
 */
public class TextGrouper {

    /**
     * Groups the given elements.
     * <p>
     * Elements are processed in page, y, x order. Each element is either appended to the
     * group opened last or starts a new group.
     *
     * @param elements elements to group; the list itself is not modified
     * @return the groups in creation order, or an empty list when {@code elements} is empty
     */
    public List<TextGroup> groupTextElements(List<TextElement> elements) {
        if (elements.isEmpty()) {
            return Collections.emptyList();
        }

        Comparator<TextElement> readingOrder = Comparator
                .comparingInt(TextElement::getPageNum)
                .thenComparingDouble(TextElement::getY)
                .thenComparingDouble(TextElement::getX);
        List<TextElement> ordered = new ArrayList<>(elements);
        ordered.sort(readingOrder);

        List<TextGroup> result = new ArrayList<>();
        TextGroup open = null;
        for (TextElement piece : ordered) {
            if (canJoin(open, piece)) {
                absorb(open, piece);
            } else {
                open = startGroup(piece);
                result.add(open);
            }
        }
        return result;
    }

    /**
     * Decides whether {@code piece} continues {@code group}: same page, close enough in y,
     * a horizontal gap within bounds, similar font size, and not two separate numbers that
     * merely touch.
     */
    private static boolean canJoin(TextGroup group, TextElement piece) {
        if (group == null || group.getPageNum() != piece.getPageNum()) {
            return false;
        }

        // Same line: vertical distance below half of the taller height.
        float tallest = Math.max(piece.getHeight(), group.getHeight());
        if (!(Math.abs(group.getY() - piece.getY()) < tallest * 0.5)) {
            return false;
        }

        // Gap between the group's right edge and the piece, bounded relative to the larger font.
        float largestFont = Math.max(piece.getFontSize(), group.getFontSize());
        float gap = piece.getX() - (group.getX() + group.getWidth());
        if (!(gap < largestFont * 0.8) || !(gap > -largestFont * 2)) {
            return false;
        }

        // Clearly different font sizes are kept apart.
        float smallestFont = Math.min(group.getFontSize(), piece.getFontSize());
        if (largestFont > 0 && smallestFont / largestFont < 0.7f) {
            return false;
        }

        String leftText = group.getText();
        String rightText = piece.getText();
        boolean leftEndsWithDigit = !leftText.isEmpty()
                && Character.isDigit(leftText.charAt(leftText.length() - 1));
        boolean rightStartsWithDigit = !rightText.isEmpty()
                && Character.isDigit(rightText.charAt(0));
        if (!(leftEndsWithDigit && rightStartsWithDigit)) {
            return true;
        }

        // Digit meets digit: two decimals, or a group that is already a complete number,
        // are treated as separate values.
        if (leftText.contains(".") && rightText.contains(".")) {
            return false;
        }
        return !leftText.matches("\\d+\\.?\\d*");
    }

    /**
     * Appends {@code piece} to {@code group}, inserting a space for a noticeable gap and
     * widening the group's bounds to cover both.
     */
    private static void absorb(TextGroup group, TextElement piece) {
        float gap = piece.getX() - (group.getX() + group.getWidth());
        String separator = gap > piece.getFontSize() * 0.3 ? " " : "";
        group.setText(group.getText() + separator + piece.getText());

        // Both edges are computed from the old bounds before any of them is updated.
        float rightEdge = Math.max(group.getX() + group.getWidth(), piece.getX() + piece.getWidth());
        float leftEdge = Math.min(group.getX(), piece.getX());
        group.setX(leftEdge);
        group.setWidth(rightEdge - leftEdge);
        group.setHeight(Math.max(group.getHeight(), piece.getHeight()));
        group.getElements().add(piece);
    }

    /** Creates a group that copies all geometry from its first element. */
    private static TextGroup startGroup(TextElement piece) {
        TextGroup fresh = new TextGroup();
        fresh.setText(piece.getText());
        fresh.setPageNum(piece.getPageNum());
        fresh.setX(piece.getX());
        fresh.setY(piece.getY());
        fresh.setWidth(piece.getWidth());
        fresh.setHeight(piece.getHeight());
        fresh.setFontSize(piece.getFontSize());
        fresh.setPageWidth(piece.getPageWidth());
        fresh.setPageHeight(piece.getPageHeight());
        fresh.getElements().add(piece);
        return fresh;
    }
}

View File

@ -0,0 +1,16 @@
package extraction;
/**
 * Repairs character sequences that come out garbled from PDF text extraction.
 */
public class TextNormalizer {

    /**
     * Ordered pairs of garbled sequence and replacement. The order matters: the three-character
     * sequence around a replacement character must be handled before the catch-all rule
     * that deletes every remaining replacement character.
     */
    private static final String[][] REPLACEMENTS = {
        {"\u00a1\u00a4", "\u03a6"},
        {"\u00a1\u00e3", "\u00b0"},
        {"\u00a1\u00c0", "\u00b1"},
        {"\u00a6\u00b5", "\u03a6"},
        {"\uffc3n\uffc3", "\u03a6"},
        {"\uffc3$\uffc3", "\u03a6"},
        {"\ufffdn\ufffd", "\u00d8"},
        {"\ufffd", ""},
    };

    /**
     * Applies every replacement rule in order.
     *
     * @param text text to normalise; must not be {@code null}
     * @return the text with known garbled sequences replaced by the diameter, degree and
     *         plus-minus signs and with leftover replacement characters removed
     */
    public static String normalizeText(String text) {
        String cleaned = text;
        for (String[] rule : REPLACEMENTS) {
            cleaned = cleaned.replace(rule[0], rule[1]);
        }
        return cleaned;
    }
}

View File

@ -0,0 +1,54 @@
package extraction;
import model.TextGroup;
import java.util.regex.Pattern;
/**
 * Static predicates used to discard text that is not a dimension: title-block content,
 * bare tolerances, surface-roughness values and GD&amp;T tolerance values.
 */
public class TitleBlockFilter {

    /**
     * Keyword list: text containing any of these alternatives anywhere is skipped.
     * The alternatives include standard codes, drawing and title-block labels, date and
     * drawing-number shapes, and process or material words; matching is case-insensitive.
     */
    private static final Pattern PAT_SKIP_TEXT = Pattern.compile(
            ".*(" +
            "GB/T|QT\\d{3}|CR\\d{3}|NF|" +
            "比例|截面|技术要求|锪平|螺纹深度|通孔|配作|" +
            "\\s*计|校\\s*对|审|批\\s*准|工\\s*艺|" +
            "\\d{2}-\\d{2}-\\d{4}|" +
            "\\d{3}-[A-Z]\\d{2}|" +
            "版本|序号|修\\s*订|编\\s*码|代\\s*号|" +
            "\\s*量|名\\s*称|材\\s*料|备\\s*注|数量|" +
            "页|共|阶\\s*段|标\\s*记|分\\s*区|更改|" +
            "级|涂层|膜厚|颜色|RAL|" +
            "铸[件造]|拔模|未注|倒角|去毛刺|热处理|" +
            "凸起|文字|Work in|JINRONG|" +
            "螺栓|螺纹|圆柱销|轴承座|上盖|底座|分体|" +
            "福建|南方|机械|有限公司|" +
            "其余|首版|赖金荣|10\\.9级|" +
            "单件|总计|腐蚀|丙烯酸|石墨烯|防腐|配套加工|" +
            "检验|标准化|设计变更" +
            ").*", Pattern.CASE_INSENSITIVE);

    /** A sign followed by a number and nothing else. */
    private static final Pattern PAT_BARE_TOLERANCE = Pattern.compile("^[+-]\\s*\\d+\\.?\\d*$");

    /** Exactly one of the listed roughness values. */
    private static final Pattern PAT_ROUGHNESS = Pattern.compile("^(0\\.4|0\\.8|1\\.6|3\\.2|6\\.3|12\\.5|25|50)$");

    /** "0.x" with up to three decimals, optionally followed by a letter and an optional digit. */
    private static final Pattern PAT_GDT = Pattern.compile("^0\\.\\d{1,3}(\\s+[A-Z]\\d?)?$");

    /**
     * @param text text to test
     * @return {@code true} if the whole text matches the keyword pattern
     */
    public static boolean shouldSkipText(String text) {
        return PAT_SKIP_TEXT.matcher(text).matches();
    }

    /**
     * Tests the group's position relative to the page size.
     *
     * @param g group to test
     * @return {@code true} if the group's y is beyond 77% of the page height, or its x is
     *         beyond 55% of the page width while its y is beyond 60% of the page height
     */
    public static boolean isInTitleBlockRegion(TextGroup g) {
        float relY = g.getY() / g.getPageHeight();
        float relX = g.getX() / g.getPageWidth();
        boolean bottomStrip = relY > 0.77;
        boolean lowerRightCorner = relX > 0.55 && relY > 0.60;
        return lowerRightCorner || bottomStrip;
    }

    /**
     * @param text text to test; surrounding whitespace is ignored
     * @return {@code true} if the text is only a signed number
     */
    public static boolean isToleranceOnly(String text) {
        return PAT_BARE_TOLERANCE.matcher(text.trim()).matches();
    }

    /**
     * @param text text to test; surrounding whitespace is ignored
     * @return {@code true} if the text equals one of the listed surface-roughness values
     */
    public static boolean isSurfaceRoughness(String text) {
        return PAT_ROUGHNESS.matcher(text.trim()).matches();
    }

    /**
     * @param text text to test; surrounding whitespace is ignored
     * @return {@code true} if the text has the shape of a GD&amp;T tolerance value
     */
    public static boolean isGdtTolerance(String text) {
        return PAT_GDT.matcher(text.trim()).matches();
    }
}

View File

@ -0,0 +1,31 @@
package model;
import lombok.Data;
/**
 * One recognised dimension: its text, optional tolerance, and its bounding box on a page.
 */
@Data
public class DimensionResult {
    /** Dimension text. */
    private String dimension;
    /** Tolerance text; may be {@code null}. */
    private String tolerance;
    /** Result type label. */
    private String type;
    /** Left edge of the bounding box. */
    private double x;
    /** Edge of the bounding box on the y axis, see the constructor for how it is derived. */
    private double y;
    /** Bounding-box width. */
    private double width;
    /** Bounding-box height. */
    private double height;
    /** Page number the result was found on. */
    private int page;
    /** Sort order; not set by this class. */
    private int sortOrder;

    /** Creates an empty result to be filled through setters. */
    public DimensionResult() {
    }

    /**
     * Creates a result whose bounding box is taken from a text group, with all coordinates
     * rounded to two decimals.
     *
     * @param dimension dimension text
     * @param tolerance tolerance text, or {@code null}
     * @param type      result type label
     * @param g         text group supplying position, size and page
     */
    public DimensionResult(String dimension, String tolerance, String type, TextGroup g) {
        this.dimension = dimension;
        this.tolerance = tolerance;
        this.type = type;
        // The stored y is the group's y minus its height.
        // NOTE(review): presumably converts a baseline coordinate to the top edge - confirm.
        float shiftedY = g.getY() - g.getHeight();
        this.x = round(g.getX());
        this.y = round(shiftedY);
        this.width = round(g.getWidth());
        this.height = round(g.getHeight());
        this.page = g.getPageNum();
    }

    /** Rounds to two decimal places. */
    private static double round(float v) {
        double scaled = v * 100.0;
        return Math.round(scaled) / 100.0;
    }
}

View File

@ -0,0 +1,13 @@
package model;
import lombok.Data;
/**
 * A single string captured from a PDF page together with its geometry.
 */
@Data
public class TextElement {
    /** The captured text. */
    private String text;
    /** Page number the text was found on. */
    private int pageNum;
    /** Horizontal position. */
    private float x;
    /** Vertical position. */
    private float y;
    /** Width of the text. */
    private float width;
    /** Height of the text. */
    private float height;
    /** Font size of the text. */
    private float fontSize;
    /** Width of the page the text is on. */
    private float pageWidth;
    /** Height of the page the text is on. */
    private float pageHeight;
}

View File

@ -0,0 +1,16 @@
package model;
import lombok.Data;
import java.util.ArrayList;
import java.util.List;
/**
 * Several {@link TextElement}s merged into one piece of text with a common bounding box.
 */
@Data
public class TextGroup {
    /** Concatenated text of the group. */
    private String text;
    /** Page number the group is on. */
    private int pageNum;
    /** Horizontal position of the group. */
    private float x;
    /** Vertical position of the group. */
    private float y;
    /** Width of the group. */
    private float width;
    /** Height of the group. */
    private float height;
    /** Font size of the group. */
    private float fontSize;
    /** Width of the page the group is on. */
    private float pageWidth;
    /** Height of the page the group is on. */
    private float pageHeight;
    /** The elements the group was built from; starts out empty. */
    private List<TextElement> elements = new ArrayList<>();
}

View File

@ -0,0 +1,47 @@
package model;
import lombok.Data;
// Position of a piece of text on a PDF page: page number, rectangle, page size and ordinal.
@Data
public class TextPosition {
/**
* Page number
*/
private int pageNum;
/**
* Start position on the x axis
*/
private float x;
/**
* Start position on the y axis
*/
private float y;
/**
* Width
*/
private float width;
/**
* Height
*/
private float height;
/**
* Page width
*/
private float pageWidth;
/**
* Page height
*/
private float pageHeight;
/**
* Ordinal number
*/
private int sortOrder;
}

View File

@ -0,0 +1,95 @@
package service;
import extraction.DimensionIdentifier;
import extraction.PositionedTextStripper;
import extraction.TextGrouper;
import lombok.extern.slf4j.Slf4j;
import model.DimensionResult;
import model.TextElement;
import model.TextGroup;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URL;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Parses PDF documents and extracts dimensions from their text: text is captured with
 * positions, grouped, and then passed to the dimension identifier.
 */
@Slf4j
public class PdfExtractionService {
    /** Maximum time to wait for the connection to the PDF URL, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;
    /** Maximum time a single read from the PDF URL may block, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 60_000;

    private final TextGrouper textGrouper;
    private final DimensionIdentifier dimensionIdentifier;

    /**
     * @param textGrouper         groups captured text elements
     * @param dimensionIdentifier recognises dimensions in the groups
     */
    public PdfExtractionService(TextGrouper textGrouper,
                                DimensionIdentifier dimensionIdentifier) {
        this.textGrouper = textGrouper;
        this.dimensionIdentifier = dimensionIdentifier;
    }

    /**
     * Extracts dimensions from a PDF file using the identifier's default mode.
     *
     * @param pdfPath path of the PDF file
     * @param fileId  not used by this method
     * @return the recognised dimensions together with the document's page count
     * @throws IOException if the file cannot be read or parsed
     */
    public ExtractionResult extractDimensions(Path pdfPath, String fileId) throws IOException {
        long start = System.currentTimeMillis();
        File file = pdfPath.toFile();
        try (PDDocument document = Loader.loadPDF(file)) {
            int totalPages = document.getNumberOfPages();
            List<DimensionResult> dimensions = dimensionIdentifier.identifyDimensions(extractGroups(document));
            long elapsed = System.currentTimeMillis() - start;
            log.info("Extracted {} dimensions from {} in {}ms", dimensions.size(), pdfPath.getFileName(), elapsed);
            return new ExtractionResult(dimensions, totalPages);
        }
    }

    /**
     * Region extraction: downloads the PDF and identifies dimensions with
     * {@code toleranceOnly = false}, so values without a tolerance are included too.
     *
     * @param pdfUrl URL of the PDF
     * @return all recognised dimensions of the whole document
     * @throws IOException if the URL is malformed, the download fails or times out, or the
     *                     PDF cannot be parsed
     */
    public List<DimensionResult> extractAllDimensionsForRegion(String pdfUrl) throws IOException {
        byte[] pdfBytes = download(pdfUrl);
        try (PDDocument document = Loader.loadPDF(pdfBytes)) {
            return dimensionIdentifier.identifyDimensions(extractGroups(document), false);
        }
    }

    /**
     * Reads the complete content behind the URL into memory.
     * <p>
     * Timeouts are set so that an unresponsive server cannot block the calling thread
     * indefinitely.
     */
    private static byte[] download(String pdfUrl) throws IOException {
        // NOTE(review): the URL is opened as given. If it can come from an untrusted client,
        // restrict the allowed protocols/hosts before connecting (server-side request forgery).
        java.net.URLConnection connection = new URL(pdfUrl).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        try (InputStream in = connection.getInputStream()) {
            return in.readAllBytes();
        }
    }

    /** Runs the positioned text stripper over the whole document and groups what it captured. */
    private List<TextGroup> extractGroups(PDDocument document) throws IOException {
        PositionedTextStripper stripper = new PositionedTextStripper();
        stripper.setSortByPosition(true);
        // The plain-text output is not needed; writing it is what drives the capture.
        StringWriter writer = new StringWriter();
        stripper.writeText(document, writer);
        List<TextElement> allElements = stripper.getElements();
        return textGrouper.groupTextElements(allElements);
    }

    /**
     * Result of {@link #extractDimensions(Path, String)}.
     */
    public static class ExtractionResult {
        private final List<DimensionResult> dimensions;
        private final int totalPages;

        /**
         * @param dimensions recognised dimensions
         * @param totalPages number of pages in the document
         */
        public ExtractionResult(List<DimensionResult> dimensions, int totalPages) {
            this.dimensions = dimensions;
            this.totalPages = totalPages;
        }

        /** @return the recognised dimensions */
        public List<DimensionResult> getDimensions() { return dimensions; }

        /** @return the number of pages in the document */
        public int getTotalPages() { return totalPages; }
    }
}

View File

@ -0,0 +1,90 @@
package service;
import model.DimensionResult;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Selects the extraction results that lie inside a page region and merges them into one.
 */
public class RegionFilterService {

    /**
     * Dimension texts that are treated as stand-alone symbols and left out of a merge.
     * NOTE(review): one entry is an empty string here; it may be a symbol glyph that was
     * lost in an encoding round-trip - confirm against the repository copy.
     */
    private static final Set<String> STANDALONE_SYMBOLS = Set.of("Φ", "φ", "Ø", "", "¡¤");

    /**
     * Keeps the results on the given page whose bounding box intersects the rectangle.
     *
     * @param all  results to filter
     * @param page page number to keep
     * @param rx   rectangle x
     * @param ry   rectangle y
     * @param rw   rectangle width
     * @param rh   rectangle height
     * @return a new list with the matching results, in their original order
     */
    public List<DimensionResult> filterByRegion(List<DimensionResult> all,
                                                int page, double rx, double ry, double rw, double rh) {
        return all.stream()
                .filter(d -> d.getPage() == page && intersects(d, rx, ry, rw, rh))
                .collect(Collectors.toList());
    }

    /** Inclusive rectangle overlap test: touching edges count as intersecting. */
    private boolean intersects(DimensionResult d, double rx, double ry, double rw, double rh) {
        boolean overlapsHorizontally = d.getX() + d.getWidth() >= rx && d.getX() <= rx + rw;
        boolean overlapsVertically = d.getY() + d.getHeight() >= ry && d.getY() <= ry + rh;
        return overlapsHorizontally && overlapsVertically;
    }

    /**
     * Merges several region results into a single one: texts are concatenated in reading
     * order (y ascending, then x ascending), tolerances are joined with {@code " / "}, and
     * the bounding box is the union of all boxes.
     *
     * @param filtered results to merge
     * @return {@code filtered} itself when it is {@code null}, has at most one entry, or
     *         consists only of stand-alone symbols; the single remaining result when only
     *         one entry is not a stand-alone symbol; otherwise a one-element list holding
     *         the merged result
     */
    public List<DimensionResult> mergeRegionResults(List<DimensionResult> filtered) {
        if (filtered == null || filtered.size() <= 1) {
            return filtered;
        }

        // Drop stand-alone decorative symbols.
        List<DimensionResult> meaningful = filtered.stream()
                .filter(d -> !STANDALONE_SYMBOLS.contains(textOf(d)))
                .collect(Collectors.toList());
        if (meaningful.isEmpty()) {
            return filtered;
        }
        if (meaningful.size() == 1) {
            return meaningful;
        }

        // Reading order: y ascending, then x ascending.
        meaningful.sort(Comparator.comparingDouble(DimensionResult::getY)
                .thenComparingDouble(DimensionResult::getX));

        DimensionResult first = meaningful.get(0);
        double minX = first.getX();
        double minY = first.getY();
        double maxX = first.getX() + first.getWidth();
        double maxY = first.getY() + first.getHeight();
        StringBuilder joinedText = new StringBuilder();
        StringBuilder joinedTol = new StringBuilder();
        boolean anyTolerance = false;

        // One pass collects the text, the tolerances and the union bounding box.
        for (DimensionResult d : meaningful) {
            joinedText.append(textOf(d));

            String tol = d.getTolerance();
            if (tol != null && !tol.isEmpty()) {
                if (anyTolerance) {
                    joinedTol.append(" / ");
                }
                joinedTol.append(tol);
                anyTolerance = true;
            }

            minX = Math.min(minX, d.getX());
            minY = Math.min(minY, d.getY());
            maxX = Math.max(maxX, d.getX() + d.getWidth());
            maxY = Math.max(maxY, d.getY() + d.getHeight());
        }

        DimensionResult merged = new DimensionResult();
        merged.setDimension(joinedText.toString());
        merged.setTolerance(anyTolerance ? joinedTol.toString() : null);
        merged.setType("dimension");
        merged.setX(minX);
        merged.setY(minY);
        merged.setWidth(maxX - minX);
        merged.setHeight(maxY - minY);
        merged.setPage(first.getPage());
        return Collections.singletonList(merged);
    }

    /** Trimmed dimension text, with {@code null} treated as empty. */
    private static String textOf(DimensionResult d) {
        return d.getDimension() == null ? "" : d.getDimension().trim();
    }
}

View File

@ -0,0 +1,170 @@
package com.nflg.wms.common.pojo.dto;
import lombok.Data;
import java.math.BigDecimal;
import java.time.LocalDateTime;
/**
 * Inspection standard detail row, including associated material, sampling, AQL,
 * publish and audit information. Used as the result type of the
 * {@code getDetailById} mapper query.
 */
@Data
public class QmsInspectionStandardDetailDTO {
/**
* Inspection standard ID.
*/
private Long id;
/**
* Material ID.
*/
private Long materialId;
/**
* Material number.
*/
private String materialNo;
/**
* Material category path name.
*/
private String materialCategoryCodePathName;
/**
* Material description.
*/
private String materialDesc;
/**
* Drawing number and version.
*/
private String drawingNoVer;
/**
* Drawing URL.
*/
private String drawingUrl;
/**
* Version number.
*/
private String version;
/**
* Whether the standard is enabled.
*/
private Boolean isEnabled;
/**
* Packaging method ID.
*/
private Long packagingMethodId;
/**
* Inspection cycle.
*/
private Integer inspectionCycle;
/**
* Testing method dictionary item ID.
*/
private Long testingMethodDictItemId;
/**
* Testing method dictionary item name.
*/
private String testingMethodDictItemName;
/**
* Sampling plan ID.
*/
private Long samplingPlanId;
/**
* Sampling plan name.
*/
private String samplingPlanName;
/**
* Inspection level dictionary item ID.
*/
private Long inspectionLevelDictItemId;
/**
* Inspection level dictionary item name.
*/
private String inspectionLevelDictItemName;
/**
* AQL value dictionary item ID.
*/
private Long aqlPriorityValueId;
/**
* AQL value.
*/
private BigDecimal aqlPriorityValue;
/**
* AQL type dictionary item ID.
*/
private Long aqlTypeDictItemId;
/**
* AQL type dictionary item name.
*/
private String aqlTypeDictItemName;
/**
* Publish status: 0 = unpublished, 1 = published.
*/
private Short publishStatus;
/**
* Publisher user ID.
*/
private Long publishUserId;
/**
* Publisher name.
*/
private String publishUserName;
/**
* Publish time.
*/
private LocalDateTime publishTime;
/**
* Name of the IQE this standard belongs to.
*/
private String iqeName;
/**
* Creator user ID.
*/
private Long createUserId;
/**
* Creator name.
*/
private String createUserName;
/**
* Creation time.
*/
private LocalDateTime createTime;
/**
* Last updater user ID.
*/
private Long updateUserId;
/**
* Last updater name.
*/
private String updateUserName;
/**
* Last update time.
*/
private LocalDateTime updateTime;
}

View File

@ -3,9 +3,9 @@ package com.nflg.wms.repository.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.nflg.wms.common.pojo.dto.QmsInspectionStandardDetailDTO;
import com.nflg.wms.common.pojo.qo.QmsInspectionStandardSearchQO;
import com.nflg.wms.common.pojo.vo.QmsIncomingInspectionTaskCheckItemVO;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardDetailVO;
import com.nflg.wms.common.pojo.vo.QmsInspectionStandardVO;
import com.nflg.wms.repository.entity.QmsInspectionStandard;
import org.apache.ibatis.annotations.Param;
@ -25,7 +25,7 @@ public interface QmsInspectionStandardMapper extends BaseMapper<QmsInspectionSta
/**
* 根据ID查询检验标准详情关联物料等信息
*/
QmsInspectionStandardDetailVO getDetailById(@Param("id") Long id);
QmsInspectionStandardDetailDTO getDetailById(@Param("id") Long id);
List<QmsIncomingInspectionTaskCheckItemVO> getItemsForCheck(Long id);
}

View File

@ -53,7 +53,7 @@
<!--
根据ID查询检验标准详情关联物料等信息
-->
<select id="getDetailById" resultType="com.nflg.wms.common.pojo.vo.QmsInspectionStandardDetailVO">
<select id="getDetailById" resultType="com.nflg.wms.common.pojo.dto.QmsInspectionStandardDetailDTO">
SELECT
s.id,
s.material_id AS materialId,

View File

@ -25,6 +25,7 @@
<module>nflg-wms-srm-receive</module>
<module>nflg-wms-shipment</module>
<module>nflg-qms-admin</module>
<module>nflg-qms-pdf-extract</module>
</modules>
<properties>
<java.version>17</java.version>
@ -125,6 +126,11 @@
<artifactId>nflg-wms-repository</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.nflg</groupId>
<artifactId>nflg-qms-pdf-extract</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>com.mysql</groupId>-->
<!-- <artifactId>mysql-connector-j</artifactId>-->