From 3a41b3a19802b835b54162185752f65f5694a7a2 Mon Sep 17 00:00:00 2001 From: 0007 <0007@qq.com> Date: Wed, 27 Aug 2025 19:59:16 +0800 Subject: [PATCH] Add File --- .../document/parser/PdfBoxDocumentParser.java | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java diff --git a/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java b/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java new file mode 100644 index 0000000..ab008d4 --- /dev/null +++ b/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023-2025, Agents-Flex (fuhai999@gmail.com). + *
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.agentsflex.document.parser;
+
+import com.agentsflex.core.document.Document;
+import com.agentsflex.core.document.DocumentParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+public class PdfBoxDocumentParser implements DocumentParser {
+
+    /**
+     * Extracts the text of the entire PDF read from {@code stream} and returns it as one {@link Document}.
+     * An {@link IOException} raised while loading or extracting is rethrown wrapped in a {@link RuntimeException}.
+     */
+    @Override
+    public Document parse(InputStream stream) {
+        try (PDDocument pdf = PDDocument.load(stream)) {
+            // try-with-resources closes the loaded PDDocument once the text has been pulled out.
+            return new Document(new PDFTextStripper().getText(pdf));
+        } catch (IOException ioFailure) {
+            throw new RuntimeException(ioFailure);
+        }
+    }
+
+ /**
+ * 返回每页文档的内容
+ */
+ public List