feat(pdf-extract): 新增PDF尺寸区域提取功能
- 在PdfExtractionService中实现从PDF URL中提取尺寸数据
- 通过URL读取PDF字节流并加载文档进行文本位置解析
- 利用文本分组和维度识别器提取尺寸信息列表
- 在QmsInspectionStandardController添加GET接口，支持通过URL参数提取尺寸数据
- 在QmsInspectionStandardControllerService中添加对应服务方法调用pdfExtractionService接口
This commit is contained in:
parent
f2901111b3
commit
9eccd96b00
|
|
@ -13,6 +13,7 @@ import jakarta.annotation.Resource;
|
|||
import jakarta.validation.Valid;
|
||||
import jakarta.validation.constraints.NotNull;
|
||||
import model.DimensionResult;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.validation.annotation.Validated;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
|
||||
|
|
@ -106,4 +107,12 @@ public class QmsInspectionStandardController extends BaseController {
|
|||
public ApiResult<List<DimensionResult>> pdfExtractRegion(@Valid @RequestBody QmsPdfExtractRegionQO request) throws IOException {
|
||||
return ApiResult.success(inspectionStandardControllerService.pdfExtractRegion(request));
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取PDF文件的所有区域数据
|
||||
*/
|
||||
@GetMapping("/pdf/dimensions")
|
||||
public ApiResult<List<DimensionResult>> extractDimensions(@RequestParam String url) throws IOException {
|
||||
return ApiResult.success(inspectionStandardControllerService.extractDimensions(url));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -713,4 +713,8 @@ public class QmsInspectionStandardControllerService {
|
|||
);
|
||||
return regionFilterService.mergeRegionResults(filtered);
|
||||
}
|
||||
|
||||
public List<DimensionResult> extractDimensions(String url) throws IOException {
|
||||
return pdfExtractionService.extractDimensions(url);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
|
@ -80,6 +81,30 @@ public class PdfExtractionService {
|
|||
}
|
||||
}
|
||||
|
||||
public List<DimensionResult> extractDimensions(String pdfUrl) throws IOException {
|
||||
byte[] pdfBytes;
|
||||
try (InputStream in = new URL(pdfUrl).openStream();
|
||||
ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
|
||||
byte[] chunk = new byte[8192];
|
||||
int len;
|
||||
while ((len = in.read(chunk)) != -1) {
|
||||
buffer.write(chunk, 0, len);
|
||||
}
|
||||
pdfBytes = buffer.toByteArray();
|
||||
}
|
||||
try (PDDocument document = Loader.loadPDF(pdfBytes)) {
|
||||
int totalPages = document.getNumberOfPages();
|
||||
PositionedTextStripper stripper = new PositionedTextStripper();
|
||||
stripper.setSortByPosition(true);
|
||||
StringWriter writer = new StringWriter();
|
||||
stripper.writeText(document, writer);
|
||||
List<TextElement> allElements = stripper.getElements();
|
||||
|
||||
List<TextGroup> groups = textGrouper.groupTextElements(allElements);
|
||||
return dimensionIdentifier.identifyDimensions(groups);
|
||||
}
|
||||
}
|
||||
|
||||
public static class ExtractionResult {
|
||||
private final List<DimensionResult> dimensions;
|
||||
private final int totalPages;
|
||||
|
|
|
|||
Loading…
Reference in New Issue