「java获取html」java获取html指定标签

admin 2022-11-29 12:57:09 1493

今天给各位分享java获取html的知识，其中也会对java获取html指定标签进行解释，如果能碰巧解决你现在面临的问题，别忘了关注本站，现在开始吧！

本文目录一览：

1、java怎样读取html文件
2、java正则表达式获取任意两个html标签中的内容？
3、java获取html
4、java爬虫怎么动态的获取html
5、java获取html内的内容

java怎样读取html文件

java读取html文件跟读取普通文件一样，都是使用输入输出流，但是java读取html文件之后还需要解析，使用Jsoup对html进行解析。下面是一个java读取带表格的任意html文件，并把html文件转换成excel的例子。

要求：给出任意带table表格的html文件，生成与表格内容相同的excel文件。附件可以作为测试文件：以给定的roster.html文件为例，通过java代码生成与html页面中table样式相同的roster.xls文件。

首先看roster.html:

java代码：

import java.io.BufferedReader;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

import jxl.Workbook;

import jxl.write.Label;

import jxl.write.WritableCellFormat;

import jxl.write.WritableFont;

import jxl.write.WritableSheet;

import jxl.write.WritableWorkbook;

import jxl.write.WriteException;

import jxl.write.biff.RowsExceededException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

public class HTMLTOExcel {

public static void main(String args[]) throws IOException{

///读取classpath目录下面的路径

String path=HTMLTOExcel.class.getResource("/").getPath();

path+="roster.html";

toExcel(path,"roster");

}

//得到Document并且设置编码格式

public static Document getDoc(String fileName) throws IOException{

File myFile=new File(fileName);

Document doc= Jsoup.parse(myFile, "GBK","");

return doc;

}

///这个方法用于根据trs行数和sheet画出整个表格

public static void mergeColRow(Elements trs,WritableSheet sheet) throws RowsExceededException, WriteException{

int[][] rowhb=new int[300][50];

for(int i=0;itrs.size();i++){

Element tr=trs.get(i);

Elements tds=tr.getElementsByTag("td");

int realColNum=0;

for(int j=0;jtds.size();j++){

Element td=tds.get(j);

if(rowhb[i][realColNum]!=0){

realColNum=getRealColNum(rowhb,i,realColNum);

}

int rowspan=1;

int colspan=1;

if(td.attr("rowspan")!=""){

rowspan = Integer.parseInt(td.attr("rowspan"));

}

if(td.attr("colspan")!=""){

colspan = Integer.parseInt(td.attr("colspan"));

}

String text=td.text();

drawMegerCell(rowspan,colspan,sheet,realColNum,i,text,rowhb);

realColNum=realColNum+colspan;

}

///这个方法用于根据样式画出单元格，并且根据rowpan和colspan合并单元格

public static void drawMegerCell(int rowspan,int colspan,WritableSheet sheet,int realColNum,int realRowNum,String text,int[][] rowhb) throws RowsExceededException, WriteException{

for(int i=0;irowspan;i++){

for(int j=0;jcolspan;j++){

if(i!=0||j!=0){

text="";

}

Label label = new Label(realColNum+j,realRowNum+i,text);

WritableFont countents = new WritableFont(WritableFont.TIMES,10); // 设置单元格内容，字号12

WritableCellFormat cellf = new WritableCellFormat(countents );

cellf.setAlignment(jxl.format.Alignment.CENTRE);//把水平对齐方式指定为居中

cellf.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);//把垂直对齐方式指定为居

label.setCellFormat(cellf);

sheet.addCell(label);

rowhb[realRowNum+i][realColNum+j]=1;

}

sheet.mergeCells(realColNum,realRowNum, realColNum+colspan-1,realRowNum+rowspan-1);

}

public static int getRealColNum(int[][] rowhb,int i,int realColNum){

while(rowhb[i][realColNum]!=0){

realColNum++;

}

return realColNum;

}

///根据colgroups设置表格的列宽

public static void setColWidth(Elements colgroups,WritableSheet sheet){

if(colgroups.size()0){

Element colgroup=colgroups.get(0);

Elements cols=colgroup.getElementsByTag("col");

for(int i=0;icols.size();i++){

Element col=cols.get(i);

String strwd=col.attr("width");

if(col.attr("width")!=""){

int wd=Integer.parseInt(strwd);

sheet.setColumnView(i,wd/8);

}

//toExcel是根据html文件地址生成对应的xls

public static void toExcel(String fileName,String excelName)throws IOException{

Document doc=getDoc(fileName);

String title = doc.title();

///得到样式，以后可以根据正则表达式解析css，暂且没有找到cssparse

Elements style= doc.getElementsByTag("style");

///得到Table，demo只演示输入一个table，以后可以用循环遍历tables集合输入所有table

Elements tables= doc.getElementsByTag("TABLE");

if(tables.size()==0){

return;

}

Element table=tables.get(0);

//得到所有行

Elements trs = table.getElementsByTag("tr");

///得到列宽集合

Elements colgroups=table.getElementsByTag("colgroup");

try {

//文件保存到classpath目录下面

String path=HTMLTOExcel.class.getResource("/").getPath();

path+=excelName+".xls";

System.out.println(path);

WritableWorkbook book = Workbook.createWorkbook(new File(path));

WritableSheet sheet = book.createSheet("人事关系", 0);

setColWidth(colgroups,sheet);

mergeColRow(trs,sheet);

book.write();

book.close();

} catch (RowsExceededException e) {

e.printStackTrace();

} catch (WriteException e) {

e.printStackTrace();

}

解析html文件的例子文档地址：

java正则表达式获取任意两个html标签中的内容？

import java.util.regex.*;

/**
 * Extracts the text located between two specific HTML tags with a regular expression.
 */
public class RegexExample {

    /**
     * Lazily matches everything between {@code <div class="fliter_px">} and the next
     * {@code <div class="search_page">}. {@code [\s\S]} is used so that line breaks
     * are matched as well. The angle brackets are restored here; without them the
     * captured group would start with {@code >} and end with {@code <}.
     */
    private static final Pattern BETWEEN_TAGS = Pattern.compile(
            "<div class=\"fliter_px\">([\\s\\S]+?)<div class=\"search_page\">");

    /**
     * Prints the extracted content of the page text passed as the first argument.
     *
     * @param args {@code args[0]} is the fetched page content; empty content is used when absent
     */
    public static void main(String[] args) {
        String content = args.length > 0 ? args[0] : "";
        String inner = extractBetweenTags(content);
        if (inner != null) {
            System.out.println(inner);
        }
    }

    /**
     * Returns the text between the two marker tags.
     *
     * @param content the HTML text to search
     * @return the first non-empty text between the markers, or {@code null} if there is no match
     */
    public static String extractBetweenTags(String content) {
        Matcher m = BETWEEN_TAGS.matcher(content);
        return m.find() ? m.group(1) : null;
    }
}

java获取html

Java访问网络url，获取网页的html代码

方式一：

一是使用URL类的openStream()方法：

openStream()方法与指定的URL建立连接并返回InputStream类的对象，以从这一连接中读取数据；

openStream()方法只能读取网络资源。

二是使用URL类的openConnection()方法：

openConnection()方法会创建一个URLConnection类的对象，此对象在本地机和URL指定的远程节点建立一条HTTP协议的数据通道，可进行双向数据传输。类URLConnection提供了很多设置和获取连接参数的方法，最常用到的是getInputStream()和getOutputStream()方法。

openConnection()方法既能读取又能发送数据。

例如：

/**
 * Opens the given URL with {@code URL.openStream()} and prints the response to
 * standard output line by line. Any failure is reported via its stack trace.
 *
 * @param args unused
 * @throws Exception declared for compatibility; failures are caught and printed
 */
public static void main(String args[]) throws Exception {
    try {
        // Replace the placeholder below with the URL to fetch.
        URL url = new URL("url路径");
        // try-with-resources closes the reader chain even if reading fails midway.
        // NOTE(review): the reader uses the platform default charset — confirm it matches the page.
        try (InputStream in = url.openStream();
             InputStreamReader isr = new InputStreamReader(in);
             BufferedReader bufr = new BufferedReader(isr)) {
            String str;
            while ((str = bufr.readLine()) != null) {
                System.out.println(str);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

java爬虫怎么动态的获取html

package httpclient_learn;

import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Fetches a page with Apache HttpClient and prints its HTML.
 */
public class HttpClientTest {

    /**
     * Issues a GET request and prints the response body decoded as UTF-8.
     *
     * @param args {@code args[0]} is the URL to fetch; the original empty placeholder is used when absent
     */
    public static void main(String[] args) {
        // 1. Create the client (comparable to opening a browser).
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        // 2. Build the GET request (comparable to typing the address).
        HttpGet request = new HttpGet(args.length > 0 ? args[0] : "");
        try {
            // 3. Execute the request (comparable to pressing Enter).
            response = httpClient.execute(request);
            // 4. Handle the response according to its status code.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                // 5. Read the response body.
                HttpEntity httpEntity = response.getEntity();
                String html = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(html);
            } else {
                // Any status other than 200 (e.g. 404) is only reported here.
                System.out.println("返回状态不是200");
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // 6. Release the response first, then the client.
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
    }
}

java获取html内的内容

简单实现：

HtmlRequest类的内容：

[java] view plaincopy

package com.capinfotech.net;

import java.io.ByteArrayOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.net.HttpURLConnection;

import java.net.URL;

/**
 * Downloads a web page over HTTP and prints its content.
 */
public class HtmlRequest {

    /**
     * Fetches the page and prints it decoded as gb2312.
     *
     * @param args {@code args[0]} is the URL to fetch; the original empty placeholder is used when absent
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static void main(String[] args) throws IOException {
        String address = args.length > 0 ? args[0] : "";
        URL url = new URL(address);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Close the stream and release the connection even if reading fails.
        try (InputStream inputStream = conn.getInputStream()) {
            byte[] getData = readInputStream(inputStream); // raw bytes of the page
            String data = new String(getData, "gb2312");
            System.out.println(data);
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Reads the stream until end-of-stream and returns everything that was read.
     * The stream is not closed by this method.
     *
     * @param inputStream the stream to drain
     * @return all bytes read from the stream (empty if the stream had no data)
     * @throws IOException if reading fails
     */
    public static byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        int len;
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        return bos.toByteArray();
    }
}

这样就能获得指定网页的内容，并在控制台打印输出。

java获取html的介绍就聊到这里吧，感谢你花时间阅读本站内容，更多关于java获取html指定标签、java获取html的信息别忘了在本站进行查找喔。

标签：java获取html

本文目录一览：

java怎样读取html文件

java正则表达式获取任意两个html标签中的内容？

java获取html

java爬虫怎么动态的获取html

java获取html内的内容

java爬虫怎么动态的获取html