wms/nflg-qms-pdf-extract/src/main/java/service/PdfExtractionService.java

121 lines
4.6 KiB
Java
Raw Normal View History

package service;
import extraction.DimensionIdentifier;
import extraction.PositionedTextStripper;
import extraction.TextGrouper;
import lombok.extern.slf4j.Slf4j;
import model.DimensionResult;
import model.TextElement;
import model.TextGroup;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Extracts dimension annotations from PDF documents.
 *
 * <p>Each entry point loads a PDF (from a local path or by downloading a URL), runs it
 * through a {@link PositionedTextStripper}, groups the resulting text elements with the
 * injected {@link TextGrouper}, and hands the groups to the injected
 * {@link DimensionIdentifier}.
 */
@Slf4j
public class PdfExtractionService {

    /** Size in bytes of the buffer used while copying a remote PDF into memory. */
    private static final int DOWNLOAD_BUFFER_SIZE = 8192;

    private final TextGrouper textGrouper;
    private final DimensionIdentifier dimensionIdentifier;

    /**
     * Creates a service backed by the given collaborators.
     *
     * @param textGrouper groups individual text elements into {@link TextGroup}s
     * @param dimensionIdentifier turns text groups into {@link DimensionResult}s
     */
    public PdfExtractionService(TextGrouper textGrouper,
                                DimensionIdentifier dimensionIdentifier) {
        this.textGrouper = textGrouper;
        this.dimensionIdentifier = dimensionIdentifier;
    }

    /**
     * Extracts dimensions from a PDF stored on the local file system and logs how many
     * dimensions were found and how long the extraction took.
     *
     * @param pdfPath location of the PDF file
     * @param fileId identifier of the file; currently unused by this method and kept only
     *               so existing callers keep compiling
     * @return the identified dimensions together with the document's page count
     * @throws IOException if the file cannot be read or parsed
     */
    public ExtractionResult extractDimensions(Path pdfPath, String fileId) throws IOException {
        long start = System.currentTimeMillis();
        File file = pdfPath.toFile();
        try (PDDocument document = Loader.loadPDF(file)) {
            int totalPages = document.getNumberOfPages();
            List<DimensionResult> dimensions =
                    dimensionIdentifier.identifyDimensions(extractTextGroups(document));
            long elapsed = System.currentTimeMillis() - start;
            log.info("Extracted {} dimensions from {} in {}ms", dimensions.size(), pdfPath.getFileName(), elapsed);
            return new ExtractionResult(dimensions, totalPages);
        }
    }

    /**
     * Region extraction: no tolerance filtering is applied, all dimensions are included,
     * and nothing is saved to the database.
     *
     * <p>The PDF is downloaded fully into memory before parsing. The {@code false} flag
     * passed to the identifier is presumably what disables the tolerance filter; confirm
     * against {@link DimensionIdentifier}.
     *
     * @param pdfUrl URL of the PDF to download
     * @return the dimensions identified in the document
     * @throws IOException if the URL is malformed, the download fails, or the PDF cannot
     *                     be parsed
     */
    public List<DimensionResult> extractAllDimensionsForRegion(String pdfUrl) throws IOException {
        byte[] pdfBytes = downloadPdf(pdfUrl);
        try (PDDocument document = Loader.loadPDF(pdfBytes)) {
            return dimensionIdentifier.identifyDimensions(extractTextGroups(document), false);
        }
    }

    /**
     * Downloads a PDF from the given URL and extracts its dimensions using the
     * identifier's single-argument {@code identifyDimensions} overload.
     *
     * @param pdfUrl URL of the PDF to download
     * @return the dimensions identified in the document
     * @throws IOException if the URL is malformed, the download fails, or the PDF cannot
     *                     be parsed
     */
    public List<DimensionResult> extractDimensions(String pdfUrl) throws IOException {
        byte[] pdfBytes = downloadPdf(pdfUrl);
        try (PDDocument document = Loader.loadPDF(pdfBytes)) {
            return dimensionIdentifier.identifyDimensions(extractTextGroups(document));
        }
    }

    /**
     * Reads the complete content behind {@code pdfUrl} into a byte array.
     *
     * <p>NOTE(review): {@link URL#openStream()} applies no connect/read timeout and
     * accepts any URL scheme the JVM supports. If {@code pdfUrl} can come from untrusted
     * input, consider restricting the scheme/host and configuring timeouts.
     */
    private static byte[] downloadPdf(String pdfUrl) throws IOException {
        try (InputStream in = new URL(pdfUrl).openStream();
             ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
            byte[] chunk = new byte[DOWNLOAD_BUFFER_SIZE];
            int len;
            while ((len = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, len);
            }
            return buffer.toByteArray();
        }
    }

    /**
     * Runs a position-sorted text stripper over the document and groups the elements it
     * reports.
     */
    private List<TextGroup> extractTextGroups(PDDocument document) throws IOException {
        PositionedTextStripper stripper = new PositionedTextStripper();
        stripper.setSortByPosition(true);
        // The written text itself is discarded; only the elements exposed by the stripper
        // afterwards are used (presumably collected as a side effect of writeText).
        stripper.writeText(document, new StringWriter());
        List<TextElement> allElements = stripper.getElements();
        return textGrouper.groupTextElements(allElements);
    }

    /** Immutable holder for the outcome of a file-based extraction. */
    public static class ExtractionResult {
        private final List<DimensionResult> dimensions;
        private final int totalPages;

        /**
         * @param dimensions dimensions identified in the document (stored as given, not copied)
         * @param totalPages number of pages in the source document
         */
        public ExtractionResult(List<DimensionResult> dimensions, int totalPages) {
            this.dimensions = dimensions;
            this.totalPages = totalPages;
        }

        /** @return the identified dimensions, as passed to the constructor */
        public List<DimensionResult> getDimensions() { return dimensions; }

        /** @return the number of pages in the source document */
        public int getTotalPages() { return totalPages; }
    }
}