2026-05-06 15:49:28 +08:00
|
|
|
|
package service;
|
|
|
|
|
|
|
|
|
|
|
|
import extraction.DimensionIdentifier;
|
|
|
|
|
|
import extraction.PositionedTextStripper;
|
|
|
|
|
|
import extraction.TextGrouper;
|
|
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
|
|
import model.DimensionResult;
|
|
|
|
|
|
import model.TextElement;
|
|
|
|
|
|
import model.TextGroup;
|
|
|
|
|
|
import org.apache.pdfbox.Loader;
|
|
|
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
|
|
|
|
|
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
|
import java.io.StringWriter;
|
2026-05-08 09:03:02 +08:00
|
|
|
|
import java.net.MalformedURLException;
|
2026-05-06 15:49:28 +08:00
|
|
|
|
import java.net.URL;
|
|
|
|
|
|
import java.nio.file.Path;
|
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
|
|
|
|
@Slf4j
|
|
|
|
|
|
public class PdfExtractionService {
|
|
|
|
|
|
|
|
|
|
|
|
private final TextGrouper textGrouper;
|
|
|
|
|
|
private final DimensionIdentifier dimensionIdentifier;
|
|
|
|
|
|
|
|
|
|
|
|
public PdfExtractionService(TextGrouper textGrouper,
|
|
|
|
|
|
DimensionIdentifier dimensionIdentifier) {
|
|
|
|
|
|
this.textGrouper = textGrouper;
|
|
|
|
|
|
this.dimensionIdentifier = dimensionIdentifier;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public ExtractionResult extractDimensions(Path pdfPath, String fileId) throws IOException {
|
|
|
|
|
|
long start = System.currentTimeMillis();
|
|
|
|
|
|
File file = pdfPath.toFile();
|
|
|
|
|
|
|
|
|
|
|
|
try (PDDocument document = Loader.loadPDF(file)) {
|
|
|
|
|
|
int totalPages = document.getNumberOfPages();
|
|
|
|
|
|
|
|
|
|
|
|
PositionedTextStripper stripper = new PositionedTextStripper();
|
|
|
|
|
|
stripper.setSortByPosition(true);
|
|
|
|
|
|
StringWriter writer = new StringWriter();
|
|
|
|
|
|
stripper.writeText(document, writer);
|
|
|
|
|
|
List<TextElement> allElements = stripper.getElements();
|
|
|
|
|
|
|
|
|
|
|
|
List<TextGroup> groups = textGrouper.groupTextElements(allElements);
|
|
|
|
|
|
List<DimensionResult> dimensions = dimensionIdentifier.identifyDimensions(groups);
|
|
|
|
|
|
|
|
|
|
|
|
long elapsed = System.currentTimeMillis() - start;
|
|
|
|
|
|
log.info("Extracted {} dimensions from {} in {}ms", dimensions.size(), pdfPath.getFileName(), elapsed);
|
|
|
|
|
|
|
|
|
|
|
|
return new ExtractionResult(dimensions, totalPages);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* 区域提取:不限公差过滤,包含所有尺寸,不保存到数据库
|
|
|
|
|
|
*/
|
|
|
|
|
|
public List<DimensionResult> extractAllDimensionsForRegion(String pdfUrl) throws IOException {
|
|
|
|
|
|
byte[] pdfBytes;
|
|
|
|
|
|
try (InputStream in = new URL(pdfUrl).openStream();
|
|
|
|
|
|
ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
|
|
|
|
|
|
byte[] chunk = new byte[8192];
|
|
|
|
|
|
int len;
|
|
|
|
|
|
while ((len = in.read(chunk)) != -1) {
|
|
|
|
|
|
buffer.write(chunk, 0, len);
|
|
|
|
|
|
}
|
|
|
|
|
|
pdfBytes = buffer.toByteArray();
|
|
|
|
|
|
}
|
|
|
|
|
|
try (PDDocument document = Loader.loadPDF(pdfBytes)) {
|
|
|
|
|
|
PositionedTextStripper stripper = new PositionedTextStripper();
|
|
|
|
|
|
stripper.setSortByPosition(true);
|
|
|
|
|
|
StringWriter writer = new StringWriter();
|
|
|
|
|
|
stripper.writeText(document, writer);
|
|
|
|
|
|
List<TextElement> allElements = stripper.getElements();
|
|
|
|
|
|
List<TextGroup> groups = textGrouper.groupTextElements(allElements);
|
|
|
|
|
|
return dimensionIdentifier.identifyDimensions(groups, false);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-08 09:03:02 +08:00
|
|
|
|
public List<DimensionResult> extractDimensions(String pdfUrl) throws IOException {
|
|
|
|
|
|
byte[] pdfBytes;
|
|
|
|
|
|
try (InputStream in = new URL(pdfUrl).openStream();
|
|
|
|
|
|
ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
|
|
|
|
|
|
byte[] chunk = new byte[8192];
|
|
|
|
|
|
int len;
|
|
|
|
|
|
while ((len = in.read(chunk)) != -1) {
|
|
|
|
|
|
buffer.write(chunk, 0, len);
|
|
|
|
|
|
}
|
|
|
|
|
|
pdfBytes = buffer.toByteArray();
|
|
|
|
|
|
}
|
|
|
|
|
|
try (PDDocument document = Loader.loadPDF(pdfBytes)) {
|
|
|
|
|
|
int totalPages = document.getNumberOfPages();
|
|
|
|
|
|
PositionedTextStripper stripper = new PositionedTextStripper();
|
|
|
|
|
|
stripper.setSortByPosition(true);
|
|
|
|
|
|
StringWriter writer = new StringWriter();
|
|
|
|
|
|
stripper.writeText(document, writer);
|
|
|
|
|
|
List<TextElement> allElements = stripper.getElements();
|
|
|
|
|
|
|
|
|
|
|
|
List<TextGroup> groups = textGrouper.groupTextElements(allElements);
|
|
|
|
|
|
return dimensionIdentifier.identifyDimensions(groups);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-06 15:49:28 +08:00
|
|
|
|
public static class ExtractionResult {
|
|
|
|
|
|
private final List<DimensionResult> dimensions;
|
|
|
|
|
|
private final int totalPages;
|
|
|
|
|
|
|
|
|
|
|
|
public ExtractionResult(List<DimensionResult> dimensions, int totalPages) {
|
|
|
|
|
|
this.dimensions = dimensions;
|
|
|
|
|
|
this.totalPages = totalPages;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public List<DimensionResult> getDimensions() { return dimensions; }
|
|
|
|
|
|
public int getTotalPages() { return totalPages; }
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|