diff --git a/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java b/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java new file mode 100644 index 0000000..ab008d4 --- /dev/null +++ b/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023-2025, Agents-Flex (fuhai999@gmail.com). + *
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.agentsflex.document.parser;
+
+import com.agentsflex.core.document.Document;
+import com.agentsflex.core.document.DocumentParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+public class PdfBoxDocumentParser implements DocumentParser {
+
+ /**
+  * Returns the content of the entire document.
+  *
+  * <p>The stream is loaded as a PDF, its text is extracted in a single pass,
+  * and that text is wrapped in a new {@link Document}.
+  *
+  * @param stream the input stream the PDF is loaded from
+  * @return a {@link Document} built from the extracted text
+  * @throws RuntimeException wrapping the {@link IOException} raised while
+  *         loading the PDF or extracting its text
+  */
+ @Override
+ public Document parse(InputStream stream) {
+     // try-with-resources closes the loaded PDF on both the success and failure paths.
+     try (PDDocument pdf = PDDocument.load(stream)) {
+         return new Document(new PDFTextStripper().getText(pdf));
+     } catch (IOException ioe) {
+         throw new RuntimeException(ioe);
+     }
+ }
+
+ /**
+ * 返回每页文档的内容
+ */
+ public List