wms/nflg-qms-pdf-extract/src/main/java/service/PdfExtractionService.java

132 lines
5.6 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package service;
import extraction.DimensionIdentifier;
import extraction.PositionedTextStripper;
import extraction.TextGrouper;
import lombok.extern.slf4j.Slf4j;
import model.DimensionResult;
import model.TextElement;
import model.TextGroup;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Extracts text from a PDF fetched by URL, either as identified dimensions or as the raw
 * text groups that fall inside a rectangular region.
 *
 * <p>Both entry points download the whole PDF into memory, run a
 * {@link PositionedTextStripper} over it, and group the collected elements with the injected
 * {@link TextGrouper}.
 */
@Slf4j
public class PdfExtractionService {

    /** Maximum time to wait for the connection to the PDF host to be established. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;

    /** Maximum time a single blocking read from the PDF host may take. */
    private static final int READ_TIMEOUT_MILLIS = 30_000;

    /** Size of the chunk buffer used while copying the download into memory. */
    private static final int COPY_BUFFER_SIZE = 8192;

    private final TextGrouper textGrouper;
    private final DimensionIdentifier dimensionIdentifier;

    /**
     * Creates the service with its collaborators.
     *
     * @param textGrouper groups positioned text elements into {@link TextGroup}s
     * @param dimensionIdentifier turns text groups into {@link DimensionResult}s
     */
    public PdfExtractionService(TextGrouper textGrouper,
                                DimensionIdentifier dimensionIdentifier) {
        this.textGrouper = textGrouper;
        this.dimensionIdentifier = dimensionIdentifier;
    }

    /**
     * Downloads the PDF and returns whatever the {@link DimensionIdentifier} identifies in
     * its grouped text.
     *
     * @param pdfUrl URL of the PDF to process
     * @return the results produced by {@code DimensionIdentifier.identifyDimensions}
     * @throws IOException if the PDF cannot be downloaded, parsed, or stripped
     */
    public List<DimensionResult> extractDimensions(String pdfUrl) throws IOException {
        try (PDDocument document = Loader.loadPDF(getPdfBytes(pdfUrl))) {
            return dimensionIdentifier.identifyDimensions(groupText(document));
        }
    }

    /**
     * Region extraction: returns the text groups located inside the given rectangle as
     * plain {@code "text"} results, bypassing the {@link DimensionIdentifier} entirely.
     * The text is used as produced by the stripper/grouper, only trimmed.
     *
     * <p>The rectangle is compared directly against the coordinates reported by
     * {@link TextGroup}, so it must be expressed in that same coordinate space.
     *
     * @param pdfUrl URL of the PDF to process
     * @param rx minimum X of the region
     * @param ry minimum Y of the region
     * @param rw width of the region (extent along X)
     * @param rh height of the region (extent along Y)
     * @return one result per matching group, with coordinates rounded to two decimals
     * @throws IOException if the PDF cannot be downloaded, parsed, or stripped
     */
    public List<DimensionResult> extractRawTextForRegion(String pdfUrl,
            double rx, double ry, double rw, double rh) throws IOException {
        try (PDDocument document = Loader.loadPDF(getPdfBytes(pdfUrl))) {
            List<TextGroup> groups = groupText(document);
            List<DimensionResult> results = new ArrayList<>();
            for (TextGroup g : groups) {
                String rawText = g.getText();
                if (rawText == null) {
                    // A group without text carries nothing to report; treat it like an empty one.
                    continue;
                }
                String text = rawText.trim();
                if (text.isEmpty()) {
                    continue;
                }

                // Rotated and horizontal text need their bounding boxes computed differently.
                boolean rotated = g.isRotated();
                double gx = g.getX();
                double gy;
                double gw;
                double gh;
                if (rotated) {
                    // Rotated text: the group's "width" is its span along Y and its "height"
                    // is the glyph thickness, so the two are swapped for the box.
                    gy = g.getY();
                    gw = g.getHeight();
                    gh = g.getWidth();
                } else {
                    // NOTE(review): presumably getY() is the baseline in a top-down
                    // coordinate system, hence the box starts one height above it — confirm.
                    gy = g.getY() - g.getHeight();
                    gw = g.getWidth();
                    gh = g.getHeight();
                }

                // Skip groups whose box does not touch the region at all.
                if (gx + gw < rx || gx > rx + rw || gy + gh < ry || gy > ry + rh) {
                    continue;
                }

                if (!rotated) {
                    // Centre-Y check: large-font symbols (e.g. a 24pt diameter sign) have boxes
                    // that extend far upwards and would otherwise leak into the region. Rotated
                    // text is exempt because its Y span is determined by the text length.
                    double centerY = gy + gh / 2.0;
                    if (gh > 0 && (centerY < ry || centerY > ry + rh)) {
                        continue;
                    }

                    // Over-wide box check: in CAD-exported PDFs with huge character spacing the
                    // right edge of the box lies far beyond the visible text. Skip the group
                    // when its width exceeds twice (character count x font size) and it starts
                    // more than one font size to the left of the region.
                    double normalWidth = text.length() * g.getFontSize();
                    if (gw > normalWidth * 2 && gx < rx - g.getFontSize()) {
                        continue;
                    }
                }

                log.debug("Region match: text=[{}] rotated={} gx={} gy={} gw={} gh={} origX={} origY={} origW={} origH={}",
                        text, rotated, gx, gy, gw, gh, g.getX(), g.getY(), g.getWidth(), g.getHeight());

                DimensionResult r = new DimensionResult();
                r.setDimension(text);
                r.setTolerance(null);
                r.setType("text");
                r.setX(roundToTwoDecimals(gx));
                r.setY(roundToTwoDecimals(gy));
                r.setWidth(roundToTwoDecimals(gw));
                r.setHeight(roundToTwoDecimals(gh));
                r.setPage(g.getPageNum());
                results.add(r);
            }
            return results;
        }
    }

    /**
     * Runs the positioned text stripper over an open document and groups the elements it
     * collected. The caller owns the document and is responsible for closing it.
     */
    private List<TextGroup> groupText(PDDocument document) throws IOException {
        PositionedTextStripper stripper = new PositionedTextStripper();
        stripper.setSortByPosition(true);
        // Only the elements gathered during writeText are used; the written text is discarded.
        stripper.writeText(document, new StringWriter());
        List<TextElement> allElements = stripper.getElements();
        return textGrouper.groupTextElements(allElements);
    }

    /**
     * Downloads the resource at {@code pdfUrl} fully into memory.
     *
     * <p>Connect and read timeouts are applied so that an unresponsive host cannot block
     * the calling thread indefinitely; a timeout surfaces as an {@link IOException}.
     */
    private byte[] getPdfBytes(String pdfUrl) throws IOException {
        // NOTE(review): the URL is opened as given, so every scheme java.net.URL supports
        // (including file:) is accepted. Restrict it upstream if pdfUrl can be untrusted.
        URLConnection connection = new URL(pdfUrl).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
        try (InputStream in = connection.getInputStream();
             ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
            byte[] chunk = new byte[COPY_BUFFER_SIZE];
            int len;
            while ((len = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, len);
            }
            return buffer.toByteArray();
        }
    }

    /** Rounds a coordinate to two decimal places. */
    private static double roundToTwoDecimals(double value) {
        return Math.round(value * 100.0) / 100.0;
    }
}