package service; import extraction.DimensionIdentifier; import extraction.PositionedTextStripper; import extraction.TextGrouper; import lombok.extern.slf4j.Slf4j; import model.DimensionResult; import model.TextElement; import model.TextGroup; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Path; import java.util.List; import java.util.stream.Collectors; @Slf4j public class PdfExtractionService { private final TextGrouper textGrouper; private final DimensionIdentifier dimensionIdentifier; public PdfExtractionService(TextGrouper textGrouper, DimensionIdentifier dimensionIdentifier) { this.textGrouper = textGrouper; this.dimensionIdentifier = dimensionIdentifier; } public ExtractionResult extractDimensions(Path pdfPath, String fileId) throws IOException { long start = System.currentTimeMillis(); File file = pdfPath.toFile(); try (PDDocument document = Loader.loadPDF(file)) { int totalPages = document.getNumberOfPages(); PositionedTextStripper stripper = new PositionedTextStripper(); stripper.setSortByPosition(true); StringWriter writer = new StringWriter(); stripper.writeText(document, writer); List allElements = stripper.getElements(); List groups = textGrouper.groupTextElements(allElements); List dimensions = dimensionIdentifier.identifyDimensions(groups); long elapsed = System.currentTimeMillis() - start; log.info("Extracted {} dimensions from {} in {}ms", dimensions.size(), pdfPath.getFileName(), elapsed); return new ExtractionResult(dimensions, totalPages); } } /** * 区域提取:不限公差过滤,包含所有尺寸,不保存到数据库 */ public List extractAllDimensionsForRegion(String pdfUrl) throws IOException { byte[] pdfBytes; try (InputStream in = new URL(pdfUrl).openStream(); ByteArrayOutputStream buffer = new ByteArrayOutputStream()) { byte[] chunk = new byte[8192]; int len; while ((len = in.read(chunk)) != -1) { buffer.write(chunk, 0, len); } pdfBytes = buffer.toByteArray(); } try (PDDocument document = Loader.loadPDF(pdfBytes)) { PositionedTextStripper stripper = new PositionedTextStripper(); stripper.setSortByPosition(true); StringWriter writer = new StringWriter(); stripper.writeText(document, writer); List allElements = stripper.getElements(); List groups = textGrouper.groupTextElements(allElements); return dimensionIdentifier.identifyDimensions(groups, false); } } public List extractDimensions(String pdfUrl) throws IOException { byte[] pdfBytes; try (InputStream in = new URL(pdfUrl).openStream(); ByteArrayOutputStream buffer = new ByteArrayOutputStream()) { byte[] chunk = new byte[8192]; int len; while ((len = in.read(chunk)) != -1) { buffer.write(chunk, 0, len); } pdfBytes = buffer.toByteArray(); } try (PDDocument document = Loader.loadPDF(pdfBytes)) { int totalPages = document.getNumberOfPages(); PositionedTextStripper stripper = new PositionedTextStripper(); stripper.setSortByPosition(true); StringWriter writer = new StringWriter(); stripper.writeText(document, writer); List allElements = stripper.getElements(); List groups = textGrouper.groupTextElements(allElements); return dimensionIdentifier.identifyDimensions(groups); } } public static class ExtractionResult { private final List dimensions; private final int totalPages; public ExtractionResult(List dimensions, int totalPages) { this.dimensions = dimensions; this.totalPages = totalPages; } public List getDimensions() { return dimensions; } public int getTotalPages() { return totalPages; } } }