2026-05-06 15:49:28 +08:00
|
|
|
|
package service;
|
|
|
|
|
|
|
|
|
|
|
|
import extraction.DimensionIdentifier;
|
|
|
|
|
|
import extraction.PositionedTextStripper;
|
|
|
|
|
|
import extraction.TextGrouper;
|
|
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
|
|
import model.DimensionResult;
|
|
|
|
|
|
import model.TextElement;
|
|
|
|
|
|
import model.TextGroup;
|
|
|
|
|
|
import org.apache.pdfbox.Loader;
|
|
|
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
|
|
|
|
|
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
|
import java.io.StringWriter;
|
2026-05-08 09:03:02 +08:00
|
|
|
|
import java.net.MalformedURLException;
|
2026-05-06 15:49:28 +08:00
|
|
|
|
import java.net.URL;
|
|
|
|
|
|
import java.nio.file.Path;
|
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
|
|
|
|
@Slf4j
|
|
|
|
|
|
public class PdfExtractionService {
|
|
|
|
|
|
|
|
|
|
|
|
private final TextGrouper textGrouper;
|
|
|
|
|
|
private final DimensionIdentifier dimensionIdentifier;
|
|
|
|
|
|
|
|
|
|
|
|
public PdfExtractionService(TextGrouper textGrouper,
|
|
|
|
|
|
DimensionIdentifier dimensionIdentifier) {
|
|
|
|
|
|
this.textGrouper = textGrouper;
|
|
|
|
|
|
this.dimensionIdentifier = dimensionIdentifier;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-28 18:09:08 +08:00
|
|
|
|
public List<DimensionResult> extractDimensions(String pdfUrl) throws IOException {
|
|
|
|
|
|
try (PDDocument document = Loader.loadPDF(getPdfBytes(pdfUrl))) {
|
|
|
|
|
|
// int totalPages = document.getNumberOfPages();
|
2026-05-06 15:49:28 +08:00
|
|
|
|
PositionedTextStripper stripper = new PositionedTextStripper();
|
|
|
|
|
|
stripper.setSortByPosition(true);
|
|
|
|
|
|
StringWriter writer = new StringWriter();
|
|
|
|
|
|
stripper.writeText(document, writer);
|
|
|
|
|
|
List<TextElement> allElements = stripper.getElements();
|
|
|
|
|
|
|
|
|
|
|
|
List<TextGroup> groups = textGrouper.groupTextElements(allElements);
|
2026-05-28 18:09:08 +08:00
|
|
|
|
return dimensionIdentifier.identifyDimensions(groups);
|
2026-05-06 15:49:28 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-28 18:09:08 +08:00
|
|
|
|
private byte[] getPdfBytes(String pdfUrl) throws IOException {
|
2026-05-06 15:49:28 +08:00
|
|
|
|
try (InputStream in = new URL(pdfUrl).openStream();
|
|
|
|
|
|
ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
|
|
|
|
|
|
byte[] chunk = new byte[8192];
|
|
|
|
|
|
int len;
|
|
|
|
|
|
while ((len = in.read(chunk)) != -1) {
|
|
|
|
|
|
buffer.write(chunk, 0, len);
|
|
|
|
|
|
}
|
2026-05-28 18:09:08 +08:00
|
|
|
|
return buffer.toByteArray();
|
2026-05-06 15:49:28 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-28 18:09:08 +08:00
|
|
|
|
/**
|
|
|
|
|
|
* 区域提取:直接返回区域内的原始文本组数据(经过乱码修正),
|
|
|
|
|
|
* 不经过 DimensionIdentifier 的模式匹配和符号推断逻辑。
|
|
|
|
|
|
*/
|
|
|
|
|
|
public List<DimensionResult> extractRawTextForRegion(String pdfUrl,
|
|
|
|
|
|
double rx, double ry, double rw, double rh) throws IOException {
|
|
|
|
|
|
try (PDDocument document = Loader.loadPDF(getPdfBytes(pdfUrl))) {
|
2026-05-08 09:03:02 +08:00
|
|
|
|
PositionedTextStripper stripper = new PositionedTextStripper();
|
|
|
|
|
|
stripper.setSortByPosition(true);
|
|
|
|
|
|
StringWriter writer = new StringWriter();
|
|
|
|
|
|
stripper.writeText(document, writer);
|
|
|
|
|
|
List<TextElement> allElements = stripper.getElements();
|
|
|
|
|
|
List<TextGroup> groups = textGrouper.groupTextElements(allElements);
|
2026-05-06 15:49:28 +08:00
|
|
|
|
|
2026-05-28 18:09:08 +08:00
|
|
|
|
// 筛选落在指定区域内的 TextGroup,直接返回其文本内容
|
|
|
|
|
|
List<DimensionResult> results = new java.util.ArrayList<>();
|
|
|
|
|
|
for (TextGroup g : groups) {
|
|
|
|
|
|
String text = g.getText().trim();
|
|
|
|
|
|
if (text.isEmpty()) continue;
|
|
|
|
|
|
|
|
|
|
|
|
// 对旋转文本和水平文本分别计算边界框
|
|
|
|
|
|
double gx, gy, gw, gh;
|
|
|
|
|
|
if (g.isRotated()) {
|
|
|
|
|
|
// 旋转文本: X=列位, Width=Y跨度, Height=字符厚度
|
|
|
|
|
|
gx = g.getX();
|
|
|
|
|
|
gy = g.getY();
|
|
|
|
|
|
gw = g.getHeight(); // X方向宽度 = 字符厚度
|
|
|
|
|
|
gh = g.getWidth(); // Y方向高度 = Y跨度
|
|
|
|
|
|
} else {
|
|
|
|
|
|
gx = g.getX();
|
|
|
|
|
|
gy = g.getY() - g.getHeight();
|
|
|
|
|
|
gw = g.getWidth();
|
|
|
|
|
|
gh = g.getHeight();
|
|
|
|
|
|
}
|
|
|
|
|
|
if (gx + gw < rx || gx > rx + rw || gy + gh < ry || gy > ry + rh) continue;
|
|
|
|
|
|
|
|
|
|
|
|
// 中心点Y检查:防止大字号符号(如Φ fs=24)的边界框向上延伸过多导致误入区域。
|
|
|
|
|
|
// 仅对非旋转文本应用,旋转文本的Y跨度由文本长度决定,不受此问题影响。
|
|
|
|
|
|
if (!g.isRotated()) {
|
|
|
|
|
|
double centerY = gy + gh / 2.0;
|
|
|
|
|
|
if (gh > 0 && (centerY < ry || centerY > ry + rh)) continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 超宽边界框检查:CAD导出PDF中文本字距极大时,边界框右边缘远超实际可视范围
|
|
|
|
|
|
// 当文本宽度异常(超过字符数×字号×2)且起始位置在区域左侧外时,跳过
|
|
|
|
|
|
if (!g.isRotated()) {
|
|
|
|
|
|
double normalWidth = text.length() * g.getFontSize();
|
|
|
|
|
|
if (gw > normalWidth * 2 && gx < rx - g.getFontSize()) {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
log.debug("Region match: text=[{}] rotated={} gx={} gy={} gw={} gh={} origX={} origY={} origW={} origH={}",
|
|
|
|
|
|
text, g.isRotated(), gx, gy, gw, gh, g.getX(), g.getY(), g.getWidth(), g.getHeight());
|
|
|
|
|
|
|
|
|
|
|
|
DimensionResult r = new DimensionResult();
|
|
|
|
|
|
r.setDimension(text);
|
|
|
|
|
|
r.setTolerance(null);
|
|
|
|
|
|
r.setType("text");
|
|
|
|
|
|
r.setX(Math.round(gx * 100.0) / 100.0);
|
|
|
|
|
|
r.setY(Math.round(gy * 100.0) / 100.0);
|
|
|
|
|
|
r.setWidth(Math.round(gw * 100.0) / 100.0);
|
|
|
|
|
|
r.setHeight(Math.round(gh * 100.0) / 100.0);
|
|
|
|
|
|
r.setPage(g.getPageNum());
|
|
|
|
|
|
results.add(r);
|
|
|
|
|
|
}
|
|
|
|
|
|
return results;
|
2026-05-06 15:49:28 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|