「java取出pdf」java取出数组中的值

admin 2022-12-14 17:06:07 1188

本篇文章给大家谈谈java取出pdf，以及java取出数组中的值对应的知识点，希望对各位有所帮助，不要忘了收藏本站喔。

本文目录一览：

1、java取文件后缀名pdf什么情况会报空指针
2、java 如何读取PDF文件内容
3、用java读取pdf

java取文件后缀名pdf什么情况会报空指针

保存文件名的字符串变量为 null（例如未初始化或没有取到值）时，再对它调用 lastIndexOf、substring 等方法截取后缀名，就会报空指针异常。

Java具有简单性、面向对象、分布式、健壮性、安全性、平台独立与可移植性、多线程、动态性等特点。Java可以编写桌面应用程序、Web应用程序、分布式系统和嵌入式系统应用程序等。

发展历程：

20世纪90年代，硬件领域出现了单片式计算机系统，这种价格低廉的系统一出现就立即引起了自动控制领域人员的注意，因为使用它可以大幅度提升消费类电子产品（如电视机顶盒、面包烤箱、移动电话等）的智能化程度。

Sun公司为了抢占市场先机，在1991年成立了一个称为Green的项目小组，帕特里克、詹姆斯·高斯林、麦克·舍林丹和其他几个工程师一起组成的工作小组在加利福尼亚州门洛帕克市沙丘路的一个小工作室里面研究开发新技术，专攻计算机在家电产品上的嵌入式应用。

java 如何读取PDF文件内容

import java.io.File;

import java.io.FileOutputStream;

import java.io.OutputStreamWriter;

import java.io.Writer;

import java.net.MalformedURLException;

import java.net.URL;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

public class PdfReader {

public void readFdf(String file) throws Exception {

// 是否排序

boolean sort = false;

// pdf文件名

String pdfFile = file;

// 输入文本文件名称

String textFile = null;

// 编码方式

String encoding = "UTF-8";

// 开始提取页数

int startPage = 1;

// 结束提取页数

int endPage = Integer.MAX_VALUE;

// 文件输入流，生成文本文件

Writer output = null;

// 内存中存储的PDF Document

PDDocument document = null;

try {

// 首先当作一个URL来装载文件，如果得到异常再从本地文件系统//去装载文件

URL url = new URL(pdfFile);

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

// 获取PDF的文件名

String fileName = url.getFile();

// 以原来PDF的名称来命名新产生的txt文件

if (fileName.length() 4) {

File outputFile = new File(fileName.substring(0, fileName

.length() - 4)

+ ".txt");

textFile = outputFile.getName();

}

} catch (MalformedURLException e) {

// 如果作为URL装载得到异常则从文件系统装载

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

if (pdfFile.length() 4) {

textFile = pdfFile.substring(0, pdfFile.length() - 4)

+ ".txt";

}

// 文件输入流，写入文件倒textFile

output = new OutputStreamWriter(new FileOutputStream(textFile),

encoding);

// PDFTextStripper来提取文本

PDFTextStripper stripper = null;

stripper = new PDFTextStripper();

// 设置是否排序

stripper.setSortByPosition(sort);

// 设置起始页

stripper.setStartPage(startPage);

// 设置结束页

stripper.setEndPage(endPage);

// 调用PDFTextStripper的writeText提取并输出文本

stripper.writeText(document, output);

} finally {

if (output != null) {

// 关闭输出流

output.close();

}

if (document != null) {

// 关闭PDF Document

document.close();

}

/**

* @param args

public static void main(String[] args) {

// TODO Auto-generated method stub

PdfReader pdfReader = new PdfReader();

try {

// 取得E盘下的SpringGuide.pdf的内容

pdfReader.readFdf("E://SpringGuide.pdf");

} catch (Exception e) {

e.printStackTrace();

}

用java读取pdf

可以使用 PDFBox 0.7.3 类库：

import java.io.InputStream;import java.io.IOException;

import org.apache.lucene.document.Document;import org.pdfbox.cos.COSDocument;

import org.pdfbox.pdfparser.PDFParser;import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.pdmodel.PDDocumentInformation;import org.pdfbox.util.PDFTextStripper;

import com.search.code.Index;

public Document getDocument(Index index, String url, String title, InputStream is)throws DocCenterException {COSDocument cosDoc = null;br/ try {cosDoc = parseDocument(is);br/ } catch (IOException e) {

closeCOSDocument(cosDoc);

throw new DocCenterException("无法处理该PDF文档", e);

}

if (cosDoc.isEncrypted()) {

if (cosDoc != null)

closeCOSDocument(cosDoc);

throw new DocCenterException("该PDF文档是加密文档，无法处理");

}

String docText = null;

try {

PDFTextStripper stripper = new PDFTextStripper();

docText = stripper.getText(new PDDocument(cosDoc));

} catch (IOException e) {

closeCOSDocument(cosDoc);

throw new DocCenterException("无法处理该PDF文档", e);

}

PDDocument pdDoc = null;

try {pdDoc = new PDDocument(cosDoc);br/ PDDocumentInformation docInfo = pdDoc.getDocumentInformation();br/ if(docInfo.getTitle()!=null !docInfo.getTitle().equals("")){br/ title = docInfo.getTitle();}

} catch (Exception e) {

closeCOSDocument(cosDoc);

closePDDocument(pdDoc);

System.err.println("无法取得该PDF文档的元数据" + e.getMessage());

} finally {

closeCOSDocument(cosDoc);

closePDDocument(pdDoc);

}

return null;

}

/**
 * Runs a {@code PDFParser} over the stream and returns the document it produces.
 *
 * @param is stream supplying the PDF bytes
 * @return the document obtained from the parser after parsing
 * @throws IOException declared for failures raised while constructing or running the parser
 */
private static COSDocument parseDocument(InputStream is) throws IOException {
    PDFParser pdfParser = new PDFParser(is);
    pdfParser.parse();
    return pdfParser.getDocument();
}

/**
 * Closes the given document, doing nothing for {@code null} and suppressing any
 * {@link IOException} raised by the close.
 *
 * @param cosDoc document to close; may be {@code null}
 */
private void closeCOSDocument(COSDocument cosDoc) {
    if (cosDoc == null) {
        return;
    }
    try {
        cosDoc.close();
    } catch (IOException ignored) {
        // Best-effort close: a failure here is deliberately not reported.
    }
}

/**
 * Closes the given document, doing nothing for {@code null} and suppressing any
 * {@link IOException} raised by the close.
 *
 * @param pdDoc document to close; may be {@code null}
 */
private void closePDDocument(PDDocument pdDoc) {
    if (pdDoc == null) {
        return;
    }
    try {
        pdDoc.close();
    } catch (IOException ignored) {
        // Best-effort close: a failure here is deliberately not reported.
    }
}

java取出pdf的介绍就聊到这里吧，感谢你花时间阅读本站内容，更多关于java取出数组中的值、java取出pdf的信息别忘了在本站进行查找喔。

标签：java取出pdf