add uuid to spider
parent
6428e20543
commit
49a4ad66d3
|
@ -12,11 +12,6 @@ public class Site {
|
|||
|
||||
private String domain;
|
||||
|
||||
/**
|
||||
* for identify a task
|
||||
*/
|
||||
private String identifier;
|
||||
|
||||
private String userAgent;
|
||||
|
||||
private Map<String, String> cookies = new LinkedHashMap<String, String>();
|
||||
|
@ -66,15 +61,6 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public Site setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getEncoding() {
|
||||
return encoding;
|
||||
}
|
||||
|
@ -97,7 +83,7 @@ public class Site {
|
|||
return startUrls;
|
||||
}
|
||||
|
||||
public Site setStartUrl(String startUrl) {
|
||||
public Site addStartUrl(String startUrl) {
|
||||
this.startUrls.add(startUrl);
|
||||
return this;
|
||||
}
|
||||
|
|
|
@ -18,7 +18,7 @@ import java.util.List;
|
|||
* Date: 13-4-21
|
||||
* Time: 上午6:53
|
||||
*/
|
||||
public class Spider implements Runnable {
|
||||
public class Spider implements Runnable, Task {
|
||||
|
||||
private Downloader downloader = new HttpClientDownloader();
|
||||
|
||||
|
@ -26,6 +26,12 @@ public class Spider implements Runnable {
|
|||
|
||||
private PageProcessor pageProcessor;
|
||||
|
||||
private List<String> startUrls;
|
||||
|
||||
private Site site;
|
||||
|
||||
private String uuid;
|
||||
|
||||
private Schedular schedular = new QueueSchedular();
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
@ -36,9 +42,18 @@ public class Spider implements Runnable {
|
|||
|
||||
public Spider processor(PageProcessor pageProcessor) {
|
||||
this.pageProcessor = pageProcessor;
|
||||
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
|
||||
schedular.push(new Request(startUrl), pageProcessor.getSite());
|
||||
}
|
||||
this.site = pageProcessor.getSite();
|
||||
return this;
|
||||
}
|
||||
|
||||
public Spider startUrls(List<String> startUrls) {
|
||||
this.startUrls = startUrls;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Spider startUrl(String startUrl) {
|
||||
startUrls = new ArrayList<String>();
|
||||
startUrls.add(startUrl);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -59,13 +74,15 @@ public class Spider implements Runnable {
|
|||
|
||||
@Override
|
||||
public void run() {
|
||||
Site site = pageProcessor.getSite();
|
||||
Request request = schedular.poll(site);
|
||||
if (pipelines.isEmpty()){
|
||||
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
|
||||
schedular.push(new Request(startUrl), this);
|
||||
}
|
||||
Request request = schedular.poll(this);
|
||||
if (pipelines.isEmpty()) {
|
||||
pipelines.add(new ConsolePipeline());
|
||||
}
|
||||
while (request != null) {
|
||||
Page page = downloader.download(request,site);
|
||||
Page page = downloader.download(request, site);
|
||||
if (page == null) {
|
||||
sleep(site.getSleepTime());
|
||||
continue;
|
||||
|
@ -73,13 +90,19 @@ public class Spider implements Runnable {
|
|||
pageProcessor.process(page);
|
||||
addRequest(page);
|
||||
for (Pipeline pipeline : pipelines) {
|
||||
pipeline.process(page,site);
|
||||
pipeline.process(page, this);
|
||||
}
|
||||
sleep(site.getSleepTime());
|
||||
request = schedular.poll(site);
|
||||
request = schedular.poll(this);
|
||||
}
|
||||
}
|
||||
|
||||
public Spider setUUID(String uuid) {
|
||||
this.uuid = uuid;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
private void sleep(int time) {
|
||||
try {
|
||||
Thread.sleep(time);
|
||||
|
@ -91,8 +114,19 @@ public class Spider implements Runnable {
|
|||
private void addRequest(Page page) {
|
||||
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
|
||||
for (Request request : page.getTargetRequests()) {
|
||||
schedular.push(request,pageProcessor.getSite());
|
||||
schedular.push(request, this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getUUID() {
|
||||
if (uuid != null) {
|
||||
return uuid;
|
||||
}
|
||||
if (site != null) {
|
||||
return site.getDomain();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
/**
|
||||
* Author: code4crafer@gmail.com
|
||||
* Date: 13-6-18
|
||||
* Time: 下午2:57
|
||||
*/
|
||||
public interface Task {
|
||||
|
||||
public String getUUID();
|
||||
|
||||
}
|
|
@ -1,7 +1,7 @@
|
|||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.util.Map;
|
||||
|
@ -14,7 +14,7 @@ import java.util.Map;
|
|||
public class ConsolePipeline implements Pipeline{
|
||||
|
||||
@Override
|
||||
public void process(Page page,Site site) {
|
||||
public void process(Page page,Task task) {
|
||||
System.out.println("get page: "+page.getUrl());
|
||||
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
|
||||
System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings());
|
||||
|
|
|
@ -2,9 +2,8 @@ package us.codecraft.webmagic.pipeline;
|
|||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
|
@ -30,10 +29,8 @@ public class FilePipeline implements Pipeline {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page, Site site) {
|
||||
String domain = site.getDomain();
|
||||
domain = UrlUtils.getDomain(domain);
|
||||
String path = this.path + "" + domain + "#" + site.getIdentifier() + "/";
|
||||
public void process(Page page, Task task) {
|
||||
String path = this.path + "/" + task.getUUID() + "/";
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Author: code4crafter@gmail.com
|
||||
|
@ -10,5 +10,5 @@ import us.codecraft.webmagic.Site;
|
|||
*/
|
||||
public interface Pipeline {
|
||||
|
||||
public void process(Page page,Site site);
|
||||
public void process(Page page,Task task);
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@ public class SimplePageProcessor implements PageProcessor {
|
|||
private Site site;
|
||||
|
||||
public SimplePageProcessor(String startUrl, String urlPattern) {
|
||||
this.site = Site.me().setStartUrl(startUrl).
|
||||
this.site = Site.me().addStartUrl(startUrl).
|
||||
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
|
||||
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
|
||||
|
||||
|
|
|
@ -2,8 +2,8 @@ package us.codecraft.webmagic.schedular;
|
|||
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.LinkedHashSet;
|
||||
|
@ -28,7 +28,7 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
|
||||
private String fileUrlAllName = ".urls.txt";
|
||||
|
||||
private Site site;
|
||||
private Task task;
|
||||
|
||||
private String fileCursor = ".cursor.txt";
|
||||
|
||||
|
@ -44,13 +44,13 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
|
||||
private Set<String> urls;
|
||||
|
||||
public FileCacheQueueSchedular(Site site) {
|
||||
this.site = site;
|
||||
public FileCacheQueueSchedular(Task task) {
|
||||
this.task = task;
|
||||
}
|
||||
|
||||
public FileCacheQueueSchedular(Site site, String filePath) {
|
||||
public FileCacheQueueSchedular(Task task, String filePath) {
|
||||
this.filePath = filePath;
|
||||
this.site = site;
|
||||
this.task = task;
|
||||
}
|
||||
|
||||
private void flush() {
|
||||
|
@ -106,7 +106,7 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
urls.add(line.trim());
|
||||
lineReaded++;
|
||||
if (lineReaded > cursor.get()) {
|
||||
queue.add(new Request(line, site));
|
||||
queue.add(new Request(line));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -121,11 +121,11 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
}
|
||||
|
||||
private String getFileName(String filename) {
|
||||
return filePath + site.getDomain() + "#" + site.getIdentifier() + filename;
|
||||
return filePath + task.getUUID() + "/" + filename;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void push(Request request, Site site) {
|
||||
public synchronized void push(Request request, Task task) {
|
||||
if (!inited.get()) {
|
||||
init();
|
||||
}
|
||||
|
@ -140,7 +140,7 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
}
|
||||
|
||||
@Override
|
||||
public synchronized Request poll(Site site) {
|
||||
public synchronized Request poll(Task task) {
|
||||
if (!inited.get()) {
|
||||
init();
|
||||
}
|
||||
|
|
|
@ -2,7 +2,7 @@ package us.codecraft.webmagic.schedular;
|
|||
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
@ -23,7 +23,7 @@ public class QueueSchedular implements Schedular {
|
|||
private Set<String> urls = new HashSet<String>();
|
||||
|
||||
@Override
|
||||
public synchronized void push(Request request,Site site) {
|
||||
public synchronized void push(Request request,Task task) {
|
||||
if (logger.isDebugEnabled()){
|
||||
logger.debug("push to queue "+request.getUrl());
|
||||
}
|
||||
|
@ -34,7 +34,7 @@ public class QueueSchedular implements Schedular {
|
|||
}
|
||||
|
||||
@Override
|
||||
public synchronized Request poll(Site site) {
|
||||
public synchronized Request poll(Task task) {
|
||||
return queue.poll();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package us.codecraft.webmagic.schedular;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Author: code4crafter@gmail.com
|
||||
|
@ -10,8 +10,8 @@ import us.codecraft.webmagic.Site;
|
|||
*/
|
||||
public interface Schedular {
|
||||
|
||||
public void push(Request request,Site site);
|
||||
public void push(Request request,Task task);
|
||||
|
||||
public Request poll(Site site);
|
||||
public Request poll(Task task);
|
||||
|
||||
}
|
||||
|
|
|
@ -37,7 +37,7 @@ public class DiandianBlogProcessor implements PageProcessor {
|
|||
public Site getSite() {
|
||||
//site定义抽取配置,以及开始url等
|
||||
if (site == null) {
|
||||
site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/").
|
||||
site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
return site;
|
||||
|
|
|
@ -27,7 +27,7 @@ public class DianpingBlogProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/").
|
||||
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,7 +33,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
|||
@Override
|
||||
public Site getSite() {
|
||||
if (site==null){
|
||||
site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
||||
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500);
|
||||
}
|
||||
return site;
|
||||
|
|
|
@ -23,6 +23,6 @@ public class F58PageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
|
||||
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@ public class HuxiuProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/").
|
||||
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ public class KaichibaProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
|
||||
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ public class MeicanProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
|
||||
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ public class NjuBBSProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures").
|
||||
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/").
|
||||
return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@ public class OschinaPageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/").
|
||||
return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ public class QzoneBlogProcessor implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/").
|
||||
return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ public class SinaBlogProcesser implements PageProcessor {
|
|||
@Override
|
||||
public Site getSite() {
|
||||
if (site==null){
|
||||
site = Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000).
|
||||
site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000).
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
return site;
|
||||
|
|
|
@ -23,6 +23,6 @@ public class TianyaPageProcesser implements PageProcessor {
|
|||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
|
||||
return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
<item>
|
||||
<title>${title}</title>
|
||||
<link>http://127.0.0.1/wordpress/?p=${id}</link>
|
||||
<link>http://127.0.0.1/wordpress/?p=${uuid}</link>
|
||||
<pubDate>${date}</pubDate>
|
||||
<dc:creator>admin</dc:creator>
|
||||
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
|
||||
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${uuid}</guid>
|
||||
<description></description>
|
||||
<content:encoded><![CDATA[${content}]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||
<wp:post_id>${id}</wp:post_id>
|
||||
<wp:post_id>${uuid}</wp:post_id>
|
||||
<wp:post_date>${date}</wp:post_date>
|
||||
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
||||
<wp:comment_status>open</wp:comment_status>
|
||||
|
|
Loading…
Reference in New Issue