add offline cache and process
parent
a7316a1f57
commit
5c79550fd9
|
@ -18,7 +18,7 @@ public class Site {
|
||||||
|
|
||||||
private Map<String, String> cookies = new LinkedHashMap<String, String>();
|
private Map<String, String> cookies = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
private String encoding;
|
private String charset;
|
||||||
|
|
||||||
private List<String> startUrls = new ArrayList<String>();
|
private List<String> startUrls = new ArrayList<String>();
|
||||||
|
|
||||||
|
@ -107,11 +107,11 @@ public class Site {
|
||||||
* 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
|
* 设置页面编码,若不设置则自动根据Html meta信息获取。<br>
|
||||||
* 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
|
* 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。<br>
|
||||||
*
|
*
|
||||||
* @param encoding 编码格式,主要是"utf-8"、"gbk"两种
|
* @param charset 编码格式,主要是"utf-8"、"gbk"两种
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site setEncoding(String encoding) {
|
public Site setCharset(String charset) {
|
||||||
this.encoding = encoding;
|
this.charset = charset;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -120,8 +120,8 @@ public class Site {
|
||||||
*
|
*
|
||||||
* @return 已设置的domain
|
* @return 已设置的domain
|
||||||
*/
|
*/
|
||||||
public String getEncoding() {
|
public String getCharset() {
|
||||||
return encoding;
|
return charset;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -194,18 +194,32 @@ public class Site {
|
||||||
return false;
|
return false;
|
||||||
if (!domain.equals(site.domain)) return false;
|
if (!domain.equals(site.domain)) return false;
|
||||||
if (!startUrls.equals(site.startUrls)) return false;
|
if (!startUrls.equals(site.startUrls)) return false;
|
||||||
if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false;
|
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
|
||||||
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Task toTask(){
|
||||||
|
return new Task() {
|
||||||
|
@Override
|
||||||
|
public String getUUID() {
|
||||||
|
return Site.this.getDomain();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.this;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
int result = domain.hashCode();
|
int result = domain.hashCode();
|
||||||
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
|
result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0);
|
||||||
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
|
||||||
result = 31 * result + (encoding != null ? encoding.hashCode() : 0);
|
result = 31 * result + (charset != null ? charset.hashCode() : 0);
|
||||||
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
|
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
@ -126,6 +126,12 @@ public class Spider implements Runnable, Task {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Spider downloader(Downloader downloader) {
|
||||||
|
checkIfNotRunning();
|
||||||
|
this.downloader = downloader;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
|
@ -180,7 +186,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void processRequest(Request request) {
|
private void processRequest(Request request) {
|
||||||
Page page = downloader.download(request, site);
|
Page page = downloader.download(request, this);
|
||||||
if (page == null) {
|
if (page == null) {
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
return;
|
return;
|
||||||
|
@ -216,12 +222,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void runAsync(){
|
public void runAsync(){
|
||||||
Thread thread = new Thread(){
|
Thread thread = new Thread(this);
|
||||||
@Override
|
|
||||||
public void run() {
|
|
||||||
Spider.this.run();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
thread.setDaemon(false);
|
thread.setDaemon(false);
|
||||||
thread.start();
|
thread.start();
|
||||||
}
|
}
|
||||||
|
@ -252,4 +253,9 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return site;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,4 +14,10 @@ public interface Task {
|
||||||
*/
|
*/
|
||||||
public String getUUID();
|
public String getUUID();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 返回任务抓取的站点信息
|
||||||
|
* @return site
|
||||||
|
*/
|
||||||
|
public Site getSite();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
|
* Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
|
||||||
|
@ -16,8 +16,8 @@ public interface Downloader {
|
||||||
* 下载页面,并保存信息到Page对象中。
|
* 下载页面,并保存信息到Page对象中。
|
||||||
*
|
*
|
||||||
* @param request
|
* @param request
|
||||||
* @param site
|
* @param task
|
||||||
* @return page
|
* @return page
|
||||||
*/
|
*/
|
||||||
public Page download(Request request, Site site);
|
public Page download(Request request, Task task);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
* Date: 13-6-24
|
||||||
|
* Time: 上午7:24
|
||||||
|
*/
|
||||||
|
public class FileDownloader implements Downloader {
|
||||||
|
|
||||||
|
private String path = "/data/temp/webmagic/";
|
||||||
|
|
||||||
|
private Downloader downloaderWhenFileMiss;
|
||||||
|
|
||||||
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
|
public FileDownloader() {
|
||||||
|
this("/data/temp/webmagic/", null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public FileDownloader(String path) {
|
||||||
|
this(path, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
|
||||||
|
this.path = path;
|
||||||
|
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Page download(Request request, Task task) {
|
||||||
|
String path = this.path + "/" + task.getUUID() + "/";
|
||||||
|
Page page = null;
|
||||||
|
try {
|
||||||
|
final File file = new File(path + DigestUtils.md5Hex(request.getUrl()));
|
||||||
|
BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
|
||||||
|
String line = null;
|
||||||
|
line = bufferedReader.readLine();
|
||||||
|
if (line.equals("url:\t" + request.getUrl())) {
|
||||||
|
final String html = getHtml(bufferedReader);
|
||||||
|
page = new Page();
|
||||||
|
page.setRequest(request);
|
||||||
|
page.setUrl(PlainText.create(request.getUrl()));
|
||||||
|
page.setHtml(Html.create(html));
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
if (e instanceof FileNotFoundException) {
|
||||||
|
logger.info("File not exist for url " + request.getUrl());
|
||||||
|
} else {
|
||||||
|
logger.warn("File read error for url " + request.getUrl(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (page == null) {
|
||||||
|
page = downloadWhenMiss(request, task);
|
||||||
|
}
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
||||||
|
String line;
|
||||||
|
StringBuilder htmlBuilder= new StringBuilder();
|
||||||
|
line = bufferedReader.readLine();
|
||||||
|
line = StringUtils.removeStart(line, "html:\t");
|
||||||
|
htmlBuilder.append(line);
|
||||||
|
while ((line=bufferedReader.readLine())!=null){
|
||||||
|
htmlBuilder.append(line);
|
||||||
|
}
|
||||||
|
return htmlBuilder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Page downloadWhenMiss(Request request, Task task) {
|
||||||
|
Page page = null;
|
||||||
|
if (downloaderWhenFileMiss != null) {
|
||||||
|
page = downloaderWhenFileMiss.download(request, task);
|
||||||
|
}
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
}
|
|
@ -11,6 +11,7 @@ import org.apache.log4j.Logger;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
@ -26,24 +27,25 @@ public class HttpClientDownloader implements Downloader {
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Site site) {
|
public Page download(Request request, Task task) {
|
||||||
|
Site site = task.getSite();
|
||||||
logger.info("downloading page " + request.getUrl());
|
logger.info("downloading page " + request.getUrl());
|
||||||
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
|
HttpClient httpClient = HttpClientPool.getInstance().getClient(site);
|
||||||
String encoding = site.getEncoding();
|
String charset = site.getCharset();
|
||||||
try {
|
try {
|
||||||
HttpGet httpGet = new HttpGet(request.getUrl());
|
HttpGet httpGet = new HttpGet(request.getUrl());
|
||||||
HttpResponse httpResponse = httpClient.execute(httpGet);
|
HttpResponse httpResponse = httpClient.execute(httpGet);
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (site.getAcceptStatCode().contains(statusCode)) {
|
if (site.getAcceptStatCode().contains(statusCode)) {
|
||||||
//charset
|
//charset
|
||||||
if (encoding == null){
|
if (charset == null){
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString());
|
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
|
||||||
}
|
}
|
||||||
//
|
//
|
||||||
handleGzip(httpResponse);
|
handleGzip(httpResponse);
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
|
||||||
site.getEncoding());
|
charset);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
|
|
|
@ -1,15 +1,14 @@
|
||||||
package us.codecraft.webmagic.pipeline;
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
@ -20,6 +19,8 @@ public class FilePipeline implements Pipeline {
|
||||||
|
|
||||||
private String path = "/data/temp/webmagic/";
|
private String path = "/data/temp/webmagic/";
|
||||||
|
|
||||||
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
public FilePipeline() {
|
public FilePipeline() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -36,15 +37,12 @@ public class FilePipeline implements Pipeline {
|
||||||
file.mkdirs();
|
file.mkdirs();
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html"));
|
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString())));
|
||||||
printWriter.println("url:\t" + page.getUrl());
|
printWriter.println("url:\t" + page.getUrl());
|
||||||
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
|
printWriter.println("html:\t" + page.getHtml());
|
||||||
printWriter.println(entry.getKey() + ":\t" + entry.getValue().toStrings());
|
|
||||||
}
|
|
||||||
printWriter.close();
|
printWriter.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
logger.warn("write file error",e);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -75,7 +75,7 @@ public class UrlUtils {
|
||||||
return domain;
|
return domain;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Pattern patternForHref = Pattern.compile("(<a[^<>]*href=)[\"']{0,1}([^\"']*)[\"']{0,1}", Pattern.CASE_INSENSITIVE);
|
private static Pattern patternForHref = Pattern.compile("(<a[^<>]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public static String fixAllRelativeHrefs(String html, String url) {
|
public static String fixAllRelativeHrefs(String html, String url) {
|
||||||
StringBuilder stringBuilder = new StringBuilder();
|
StringBuilder stringBuilder = new StringBuilder();
|
||||||
|
|
|
@ -19,7 +19,7 @@ public class HttpClientDownloaderTest {
|
||||||
public void testCookie() {
|
public void testCookie() {
|
||||||
Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
|
Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix");
|
||||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site);
|
Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site.toTask());
|
||||||
Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
|
Assert.assertTrue(download.getHtml().toString().contains("flashsword30"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site==null){
|
if (site==null){
|
||||||
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500);
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.downloader.FileDownloader;
|
||||||
|
import us.codecraft.webmagic.downloader.HttpClientDownloader;
|
||||||
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Author yihua.huang@dianping.com
|
||||||
|
* Date: 13-6-24
|
||||||
|
* Time: 下午2:12
|
||||||
|
*/
|
||||||
|
public class GlobalProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
final List<String> requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings();
|
||||||
|
page.addTargetRequests(requests);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
if (site==null){
|
||||||
|
site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
|
}
|
||||||
|
return site;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new GlobalProcessor()).thread(10)
|
||||||
|
.scheduler(new FileCacheQueueScheduler("/data/webmagic/github"))
|
||||||
|
.downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader()))
|
||||||
|
.pipeline(new FilePipeline("/data/webmagic/douban"))
|
||||||
|
.run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -21,7 +21,7 @@ public class KaichibaProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
|
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,7 @@ public class MeicanProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
|
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,7 @@ public class SpiderTest {
|
||||||
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
|
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
|
||||||
// processor(pageProcessor).run();
|
// processor(pageProcessor).run();
|
||||||
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
|
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
|
||||||
System.out.println(pageProcessor2.getSite().getEncoding());
|
System.out.println(pageProcessor2.getSite().getCharset());
|
||||||
pageProcessor2.getSite().setSleepTime(500);
|
pageProcessor2.getSite().setSleepTime(500);
|
||||||
Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
||||||
run();
|
run();
|
||||||
|
|
Loading…
Reference in New Issue