使用jsoup抓取分页的问题

发布网友发布时间：2022-04-21 03:14

共5个回答

懂视网时间：2022-04-21 07:35

需要使用的是 jsoup-1.7.3.jar 包；如果需要查看文档或下载，请移步官网。

这里贴一下我用到的 Java工程的测试代码　

package com.javen.Jsoup;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
    static String url="http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html";

    /**
     * Runs the demo. Only the body-printing sample is invoked; {@link #article()}
     * and {@link #Blog()} are alternative samples that are not called here.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the invoked sample fails
     */
    public static void main(String[] args) throws Exception {
        printBodies();

        // Sample: access a URL with POST, sending a request parameter, a User-Agent,
        // a cookie and a connection timeout.
        //   Document doc = Jsoup.connect("http://www.oschina.net/")
        //       .data("query", "Java")
        //       .userAgent("I ’ m jsoup")
        //       .cookie("auth", "token")
        //       .timeout(3000)
        //       .post();
        //
        // Sample: load an HTML document from a file.
        //   File input = new File("D:/test.html");
        //   Document doc = Jsoup.parse(input, "UTF-8", "http://www.oschina.net/");
    }

    /**
     * Prints the body of an HTML document parsed from a string, then the body of
     * the document downloaded from {@link #url}.
     *
     * @throws IOException if downloading the document fails
     */
    private static void printBodies() throws IOException {
        // Parse an HTML document supplied directly as a string.
        String markup = "<html><head><title> 开源中国社区 </title></head>"
                + "<body><p> 这里是 jsoup 项目的相关文章 </p></body></html>";
        Document parsed = Jsoup.parse(markup);
        System.out.println(parsed.body());

        // Load an HTML document straight from the URL.
        Document fetched = Jsoup.connect(url).get();
        System.out.println(fetched.body().toString());
    }

    /**
     * Prints the link target and link text of every anchor found inside the
     * elements whose class attribute is "postTitle" on the blog index page.
     * An I/O failure is reported with a stack trace and otherwise ignored.
     */
    public static void article() {
        try {
            Document page = Jsoup.connect("http://www.cnblogs.com/zyw-205520/").get();
            for (Element titleBlock : page.getElementsByAttributeValue("class", "postTitle")) {
                printAnchors(titleBlock);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Prints the href attribute and the trimmed text of each anchor under the given element. */
    private static void printAnchors(Element container) {
        for (Element anchor : container.getElementsByTag("a")) {
            System.out.println(anchor.attr("href"));
            System.out.println(anchor.text().trim());
        }
    }

    /**
     * Prints the inner HTML of every element whose class attribute is "postBody"
     * on one specific blog post. An I/O failure is reported with a stack trace
     * and otherwise ignored.
     */
    public static void Blog() {
        try {
            Document post = Jsoup
                    .connect("http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html")
                    .get();
            Elements bodies = post.getElementsByAttributeValue("class", "postBody");
            for (Element body : bodies) {
                System.out.println(body.html());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

下面来介绍 Android 中使用 Jsoup 异步解析网页数据的方法。请注意：这里很容易遇到乱码的问题。

配置文件：AndroidManifest.xml中加权限

<!-- Required so the app may open network connections to download the pages it parses. -->
<uses-permission android:name="android.permission.INTERNET" />

layout的布局文件

<!-- Vertical root container: a WebView on top and a scrollable TextView below it. -->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    xmlns:tools="http://schemas.android.com/tools"
    android:layout_width="match_parent"
    android:layout_height="match_parent"
    android:orientation="vertical" >

    <!-- Fixed-height web view, looked up in code as R.id.webView. -->
    <WebView
        android:id="@+id/webView"
        android:layout_width="match_parent"
        android:layout_height="200dp" />

    <!-- Lets the text view scroll when its content is taller than the remaining space. -->
    <ScrollView
        android:layout_width="wrap_content"
        android:layout_height="wrap_content" >

        <!-- Text view, looked up in code as R.id.textView. -->
        <TextView
            android:id="@+id/textView"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:text="@string/hello_world" />
    </ScrollView>

</LinearLayout>

主要异步加载数据的代码

package com.javen.aaa;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import android.app.Activity;
import android.app.Dialog;
import android.app.ProgressDialog;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.webkit.WebView;
import android.widget.TextView;

public class MainActivity extends Activity {
 private WebView webView;
 private TextView textView;
 private static final int DIALOG_KEY = 0;
 @Override
 protected void onCreate(Bundle savedInstanceState) {
 super.onCreate(savedInstanceState);
 setContentView(R.layout.main);
 webView = (WebView) findViewById(R.id.webView);
 textView=(TextView) findViewById(R.id.textView);
 try {
  ProgressAsyncTask asyncTask=new ProgressAsyncTask(webView,textView);
  asyncTask.execute(10000);
 } catch (Exception e) {
  // TODO Auto-generated catch block
  e.printStackTrace();
 }
 }
 
 public String test() {
 StringBuffer buffer=new StringBuffer();
 Document doc;
 try {
  doc = Jsoup.connect("http://www.cnblogs.com/zyw-205520/").get();
  Elements ListDiv = doc.getElementsByAttributeValue("class","postTitle");
  for (Element element :ListDiv) {
  Elements links = element.getElementsByTag("a");
  for (Element link : links) {
   String linkHref = link.attr("href");
   String linkText = link.text().trim();
   buffer.append("linkHref=="+linkHref);
   buffer.append("linkText=="+linkText);
   
   System.out.println(linkHref);
   System.out.println(linkText);
  }
  }
 } catch (IOException e) {
  // TODO Auto-generated catch block
  e.printStackTrace();
 }
 return buffer.toString();

 }

 // 弹出"查看"对话框
 @Override
 protected Dialog onCreateDialog(int id) {
  switch (id) {
  case DIALOG_KEY: {
  ProgressDialog dialog = new ProgressDialog(this);
  dialog.setMessage("获取数据中 请稍候...");
  dialog.setIndeterminate(true);
  dialog.setCancelable(true);
  return dialog;
  }
  }
  return null;
 }
 
 public static String readHtml(String myurl) {
  StringBuffer sb = new StringBuffer("");
  URL url;
  try {
  url = new URL(myurl);
  BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream(), "gbk"));
  String s = "";
  while ((s = br.readLine()) != null) {
   sb.append(s + "
");
  }
  } catch (Exception e) {
  e.printStackTrace();
  }
  return sb.toString();
 }
 
 class ProgressAsyncTask extends AsyncTask<Integer, Integer, String> {

 private WebView webView;
 private TextView textView;
 public ProgressAsyncTask(WebView webView,TextView textView) {
  super();
  this.webView=webView;
  this.textView=textView;
 }

 /**
  * 这里的Integer参数对应AsyncTask中的第一个参数 这里的String返回值对应AsyncTask的第三个参数
  * 该方法并不运行在UI线程当中，主要用于异步操作，所有在该方法中不能对UI当中的空间进行设置和修改
  * 但是可以调用publish Progress方法触发onProgressUpdate对UI进行操作
  */
 @Override
 protected String doInBackground(Integer... params) {
  String str =null;
  Document doc = null;
  try {
//  String url ="http://www.cnblogs.com/zyw-205520/p/3355681.html";
//  
//  doc= Jsoup.parse(new URL(url).openStream(),"utf-8", url);
//  //doc = Jsoup.parse(readHtml(url));
//  //doc=Jsoup.connect(url).get();
//  str=doc.body().toString();
  doc = Jsoup.connect("http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html").get();
  Elements ListDiv = doc.getElementsByAttributeValue("class","postBody");
  for (Element element :ListDiv) {
   str=element.html();
   System.out.println(element.html());
  }
  Log.d("doInBackground", str.toString());
  System.out.println(str);
  //你可以试试GBK或UTF-8
  } catch (Exception e) {
  // TODO Auto-generated catch block
  e.printStackTrace();
  }
  return str.toString() ;
  //return test();
 }

 /**
  * 这里的String参数对应AsyncTask中的第三个参数（也就是接收doInBackground的返回值）
  * 在doInBackground方法执行结束之后在运行，并且运行在UI线程当中 可以对UI空间进行设置
  */
 @Override
 protected void onPostExecute(String result) {
  webView.loadData(result, "text/html;charset=utf-8", null);
  textView.setText(result);
  removeDialog(DIALOG_KEY);
 }

 // 该方法运行在UI线程当中,并且运行在UI线程当中 可以对UI空间进行设置
 @Override
 protected void onPreExecute() {
  showDialog(DIALOG_KEY);
 }

 /**
  * 这里的Intege参数对应AsyncTask中的第二个参数
  * 在doInBackground方法当中，，每次调用publishProgress方法都会触发onProgressUpdate执行
  * onProgressUpdate是在UI线程中执行，所有可以对UI空间进行操作
  */
 @Override
 protected void onProgressUpdate(Integer... values) {
  
 }
 }

}

热心网友时间：2022-04-21 04:43

这几天正在研究。废话不多说，直接上代码，自己研究的，通过递归抓取。pageList就是抓取到的分页页面的全部链接地址。
其中Document doc = NetUtils.getDocument(url);是jsoup抓取页面的基本操作。
/**
 * Collects every pagination link reachable from a start page by following
 * matching anchors recursively.
 */
public class HtmlAnalsysTest3 {
    /** CSS selector for every anchor inside the page body. */
    private static final String ANCHOR_SELECTOR = "body a";
    /** Attribute key that asks jsoup for the absolute form of the link. */
    private static final String ABSOLUTE_HREF = "abs:href";
    /** Only links containing this fragment are followed. */
    private static final String WANTED_FRAGMENT = "mt/huge";

    static String url = "http://www.win4000.com/mt/huge.html";

    /** Pages already fetched; used to stop the recursion from revisiting them. */
    static List<String> hisurl = new ArrayList<String>();
    /** The pagination links that were found, without duplicates. */
    static List<String> pageList = new ArrayList<String>();

    /** Crawls from {@link #url} and prints every collected pagination link. */
    public static void main(String[] args) {
        analsysPage(url);
        for (String found : pageList) {
            System.out.println(" hurl-->" + found);
        }
    }

    /**
     * Fetches one page, records its matching links in {@link #pageList} and
     * recurses into each of them. Returns early for a page that was already
     * fetched or that could not be loaded.
     *
     * @param url address of the page to analyse
     */
    private static void analsysPage(String url) {
        if (hisurl.contains(url)) {
            System.out.println("hisurl :" + hisurl.size());
            return;
        }

        Document page = NetUtils.getDocument(url);
        if (page == null) {
            System.out.println("doc is null " + url);
            return;
        }
        hisurl.add(url);

        for (Element anchor : page.select(ANCHOR_SELECTOR)) {
            String target = anchor.attr(ABSOLUTE_HREF);
            if (target.contains(WANTED_FRAGMENT)) {
                if (!pageList.contains(target)) {
                    pageList.add(target);
                }
                // Recursing discovers the links of the following pages as well.
                analsysPage(target);
            }
        }
    }
}
最终结果输出：
hurl-->http://www.win4000.com/mt/huge_2.html
hurl-->http://www.win4000.com/mt/huge_1.html
hurl-->http://www.win4000.com/mt/huge_3.html
hurl-->http://www.win4000.com/mt/huge_4.html
hurl-->http://www.win4000.com/mt/huge_5.html
hurl-->http://www.win4000.com/mt/huge.html

热心网友时间：2022-04-21 06:01

觉得吧，如果你想抓取网页分页信息，可以使用第三方工具进行抓取；但如果需要在自己的程序里实现，借助第三方工具就很麻烦，所以建议自己实现比较好。因为分页内容的每一页都有一个特定的链接，而且很相似，只有指定页数的那个参数不同而已。所以你可以先用遍历的方式把每个网页抓取下来解析，然后再存起来，这样比较实际。
但是我建议你可以在客户端也使用分页模式，这样的话，根据需求去获取，就不会一下子请求的数据量太大。

热心网友时间：2022-04-21 07:36

可以这样，第一页的URL肯定是xxx/page_index=1这样的URL的，这个page_index就代表不同的页，所以只需要动态修改这个page_index就行了。

对于空指针的问题，可以考虑看看jsoup能否拿到状态码，只有等于200的时候才进行解析，或者捕获异常。

热心网友时间：2022-04-21 11:35

{public List<String> analysePage(String url, int startPage, int endpage) throws Exception { int endPage = 0;

List<String> links = new ArrayList<String>(); try { if (startPage<=1) {
url = "http://land.fang.com/market/________1_0_1.html";
}else {
url = "http://land.fang.com/market/________1_0_"+startPage+".html";
} // 通过过滤器过滤出<A>标签 Parser parser = new Parser(url);
NodeList nodeList = parser
.extractAllNodesThatMatch(new NodeFilter()
{ // 实现该方法,用以过滤标签 public boolean accept(Node node)
{ if (node instanceof LinkTag)// 标记 return true; return false;
}

}); // 打印 String tempPage =""; for (int i = 97; i < nodeList.size(); i++)
{
LinkTag n = (LinkTag) nodeList.elementAt(i); // System.out.print(n.getStringText() + " ==>> "+n.extractLink().length()+"=="+i+"=="); if(n.extractLink().length()==69&&n.extractLink().contains("http://land.fang.com/market/")){
links.add(n.extractLink());
System.out.println(n.extractLink());
}
String title = n.getStringText(); if(isNumeric(title)){
endPage = Integer.parseInt(title)+1;

} if(isNumeric(tempPage)&&!isNumeric(title)){ break;
}
tempPage = title;

} //System.out.print(endPage+"--2222--"+links.size()); } catch (Exception e)
{
e.printStackTrace();
} if (startPage < endpage&& endpage<=endPage) {
links.addAll(analysePage(url, startPage + 1, endpage));
} for (int i=0;i<links.size();i++){ getData(links.get(i));
} return links;
} public static void getData(String introUrl){ try {
Document doc = Jsoup.connect(introUrl).get();
Elements newsHeadlines = doc.getElementsByClass("tablebox02 mt10");
Elements bianhao = doc.getElementsByClass("menubox01 mt20");
System.out.println(getSplitValue(bianhao.get(0).getElementsByTag("span").text(),":",1));

Element element = newsHeadlines.get(0).child(0);

System.out.println(element.child(0).child(0).child(1).text()); //地区 System.out.println(element.child(0).child(1).child(1).text()); //所在地 System.out.println(element.child(1).child(0).child(1).text()); //总面积 System.out.println(element.child(1).child(1).child(1).text()); // 建设用地面积 System.out.println(element.child(2).child(0).child(1).text()); //规划建筑面积 System.out.println(element.child(2).child(1).child(1).text()); //代征面积 System.out.println(getSplitValue(element.child(3).child(0).text(),"：",1)); //容积率 System.out.println(getSplitValue(element.child(3).child(1).text(),"：",1)); //绿化率 System.out.println(getSplitValue(element.child(4).child(0).text(),"：",1)); //商业比例 System.out.println(getSplitValue(element.child(4).child(1).text(),"：",1)); // 建筑密度 System.out.println(getSplitValue(element.child(5).child(0).text(),"：",1)); //*高度 System.out.println(getSplitValue(element.child(5).child(1).text(),"：",1)); //出让形式 System.out.println(getSplitValue(element.child(6).child(0).text(),"：",1)); //出让年限 System.out.println(getSplitValue(element.child(6).child(1).text(),"：",1)); //位置 System.out.println(getSplitValue(element.child(7).child(0).getElementsByAttribute("title").text(),"：",1)); //标题 System.out.println(getSplitValue(element.child(7).child(1).child(1).text(),">>",0)); //规划用途 System.out.println("=========================");
} catch (IOException e) {
e.printStackTrace();
}
} public static void main(String[] args) throws Exception { new test().analysePage("http://land.fang.com/market/________1_0_1.html",1,1); // getDownloadUrl("http://land.fang.com/market/37eae58c-c701-4e4f-b1af-3e0c8e3be1c6.html"); } public static String getSplitValue(String value,String cha,int index){
String [] strings = value.split(cha); if (strings.length>index){ return strings[index].trim();
}else { return strings[0].trim();
}

}
}