diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 423b0a6..676584a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -18,7 +18,7 @@ public class Site { private Map cookies = new LinkedHashMap(); - private String encoding; + private String charset; private List startUrls = new ArrayList(); @@ -107,11 +107,11 @@ public class Site { * 设置页面编码,若不设置则自动根据Html meta信息获取。
* 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。
* - * @param encoding 编码格式,主要是"utf-8"、"gbk"两种 + * @param charset 编码格式,主要是"utf-8"、"gbk"两种 * @return this */ - public Site setEncoding(String encoding) { - this.encoding = encoding; + public Site setCharset(String charset) { + this.charset = charset; return this; } @@ -120,8 +120,8 @@ public class Site { * * @return 已设置的domain */ - public String getEncoding() { - return encoding; + public String getCharset() { + return charset; } /** @@ -194,18 +194,32 @@ public class Site { return false; if (!domain.equals(site.domain)) return false; if (!startUrls.equals(site.startUrls)) return false; - if (encoding != null ? !encoding.equals(site.encoding) : site.encoding != null) return false; + if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; return true; } + public Task toTask(){ + return new Task() { + @Override + public String getUUID() { + return Site.this.getDomain(); + } + + @Override + public Site getSite() { + return Site.this; + } + }; + } + @Override public int hashCode() { int result = domain.hashCode(); result = 31 * result + (startUrls != null ? startUrls.hashCode() : 0); result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); - result = 31 * result + (encoding != null ? encoding.hashCode() : 0); + result = 31 * result + (charset != null ? charset.hashCode() : 0); result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); return result; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a5e0624..b2a2fa6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -126,6 +126,12 @@ public class Spider implements Runnable, Task { return this; } + public Spider downloader(Downloader downloader) { + checkIfNotRunning(); + this.downloader = downloader; + return this; + } + @Override public void run() { @@ -180,7 +186,7 @@ public class Spider implements Runnable, Task { } private void processRequest(Request request) { - Page page = downloader.download(request, site); + Page page = downloader.download(request, this); if (page == null) { sleep(site.getSleepTime()); return; @@ -216,12 +222,7 @@ public class Spider implements Runnable, Task { } public void runAsync(){ - Thread thread = new Thread(){ - @Override - public void run() { - Spider.this.run(); - } - }; + Thread thread = new Thread(this); thread.setDaemon(false); thread.start(); } @@ -252,4 +253,9 @@ public class Spider implements Runnable, Task { } return null; } + + @Override + public Site getSite() { + return site; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java index 136b467..14c1d31 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java @@ -14,4 +14,10 @@ public interface Task { */ public String getUUID(); + /** + * 返回任务抓取的站点信息 + * @return site + */ + public Site getSite(); + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index e3ecff8..9a8bac1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -2,7 +2,7 @@ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。 @@ -16,8 +16,8 @@ public interface Downloader { * 下载页面,并保存信息到Page对象中。 * * @param request - * @param site + * @param task * @return page */ - public Page download(Request request, Site site); + public Page download(Request request, Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java new file mode 100644 index 0000000..b4a49ac --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -0,0 +1,88 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +import java.io.*; + +/** + * @author code4crafer@gmail.com + * Date: 13-6-24 + * Time: 上午7:24 + */ +public class FileDownloader implements Downloader { + + private String path = "/data/temp/webmagic/"; + + private Downloader downloaderWhenFileMiss; + + private Logger logger = Logger.getLogger(getClass()); + + public FileDownloader() { + this("/data/temp/webmagic/", null); + } + + public FileDownloader(String path) { + this(path, null); + } + + public FileDownloader(String path, Downloader downloaderWhenFileMiss) { + this.path = path; + this.downloaderWhenFileMiss = downloaderWhenFileMiss; + } + + @Override + public Page download(Request request, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + Page page = null; + try { + final File file = new File(path + DigestUtils.md5Hex(request.getUrl())); + BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); + String line = null; + line = bufferedReader.readLine(); + if (line.equals("url:\t" + request.getUrl())) { + final String html = getHtml(bufferedReader); + page = new Page(); + page.setRequest(request); + page.setUrl(PlainText.create(request.getUrl())); + page.setHtml(Html.create(html)); + } + } catch (IOException e) { + if (e instanceof FileNotFoundException) { + logger.info("File not exist for url " + request.getUrl()); + } else { + logger.warn("File read error for url " + request.getUrl(), e); + } + } + if (page == null) { + page = downloadWhenMiss(request, task); + } + return page; + } + + private String getHtml(BufferedReader bufferedReader) throws IOException { + String line; + StringBuilder htmlBuilder= new StringBuilder(); + line = bufferedReader.readLine(); + line = StringUtils.removeStart(line, "html:\t"); + htmlBuilder.append(line); + while ((line=bufferedReader.readLine())!=null){ + htmlBuilder.append(line); + } + return htmlBuilder.toString(); + } + + private Page downloadWhenMiss(Request request, Task task) { + Page page = null; + if (downloaderWhenFileMiss != null) { + page = downloaderWhenFileMiss.download(request, task); + } + return page; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 1b628cd..d2c2d62 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -11,6 +11,7 @@ import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; @@ -26,24 +27,25 @@ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); @Override - public Page download(Request request, Site site) { + public Page download(Request request, Task task) { + Site site = task.getSite(); logger.info("downloading page " + request.getUrl()); HttpClient httpClient = HttpClientPool.getInstance().getClient(site); - String encoding = site.getEncoding(); + String charset = site.getCharset(); try { HttpGet httpGet = new HttpGet(request.getUrl()); HttpResponse httpResponse = httpClient.execute(httpGet); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { //charset - if (encoding == null){ + if (charset == null){ String value = httpResponse.getEntity().getContentType().getValue(); - site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString()); + charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); } // handleGzip(httpResponse); String content = IOUtils.toString(httpResponse.getEntity().getContent(), - site.getEncoding()); + charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index c7cd9c5..b079dcc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,15 +1,14 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.selector.Selectable; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; -import java.util.Map; /** * @author code4crafter@gmail.com
@@ -20,6 +19,8 @@ public class FilePipeline implements Pipeline { private String path = "/data/temp/webmagic/"; + private Logger logger = Logger.getLogger(getClass()); + public FilePipeline() { } @@ -36,15 +37,12 @@ public class FilePipeline implements Pipeline { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()))); printWriter.println("url:\t" + page.getUrl()); - for (Map.Entry entry : page.getFields().entrySet()) { - printWriter.println(entry.getKey() + ":\t" + entry.getValue().toStrings()); - } + printWriter.println("html:\t" + page.getHtml()); printWriter.close(); } catch (IOException e) { - e.printStackTrace(); + logger.warn("write file error",e); } - } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 74e486c..667aaf2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -75,7 +75,7 @@ public class UrlUtils { return domain; } - private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"']*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); + private static Pattern patternForHref = Pattern.compile("(]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE); public static String fixAllRelativeHrefs(String html, String url) { StringBuilder stringBuilder = new StringBuilder(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 5e83422..936aece 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -19,7 +19,7 @@ public class HttpClientDownloaderTest { public void testCookie() { Site site = Site.me().setDomain("www.diandian.com").addCookie("t", "yct7q7e6v319wpg4cpxqduu5m77lcgix"); HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); - Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site); + Page download = httpClientDownloader.download(new Request("http://www.diandian.com"), site.toTask()); Assert.assertTrue(download.getHtml().toString().contains("flashsword30")); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 13ed2e1..695d2e2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -34,7 +34,7 @@ public class DiaoyuwengProcessor implements PageProcessor { public Site getSite() { if (site==null){ site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); + setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); } return site; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java new file mode 100644 index 0000000..07f0101 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.FileDownloader; +import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; + +import java.util.List; + +/** + * Author yihua.huang@dianping.com + * Date: 13-6-24 + * Time: 下午2:12 + */ +public class GlobalProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + final List requests = page.getHtml().links().regex(".*book\\.douban\\.com.*").toStrings(); + page.addTargetRequests(requests); + + } + + @Override + public Site getSite() { + if (site==null){ + site = Site.me().setDomain("douban.com").addStartUrl("http://book.douban.com/").setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new GlobalProcessor()).thread(10) + .scheduler(new FileCacheQueueScheduler("/data/webmagic/github")) + .downloader(new FileDownloader("/data/webmagic/douban", new HttpClientDownloader())) + .pipeline(new FilePipeline("/data/webmagic/douban")) + .run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 0a51b36..aff18a6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -21,7 +21,7 @@ public class KaichibaProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). + return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index bd21811..39f5723 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -27,7 +27,7 @@ public class MeicanProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). + return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index f2668f2..76a423f 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -29,7 +29,7 @@ public class SpiderTest { // Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html"); - System.out.println(pageProcessor2.getSite().getEncoding()); + System.out.println(pageProcessor2.getSite().getCharset()); pageProcessor2.getSite().setSleepTime(500); Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run();