diff --git a/pom.xml b/pom.xml index 5974eae..cacce99 100644 --- a/pom.xml +++ b/pom.xml @@ -3,15 +3,15 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - 0.1.0 + 0.2.0 4.0.0 pom webmagic - - webmagic-core - webmagic-plugin/ - webmagic-samples/ + + webmagic-core + webmagic-extension/ + webmagic-samples/ @@ -27,6 +27,11 @@ httpclient 4.2.4 + + net.sf.saxon + Saxon-HE + 9.5.1-1 + log4j log4j @@ -45,7 +50,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.4 + 2.5 org.apache.commons @@ -75,6 +80,7 @@ org.apache.maven.plugins maven-dependency-plugin + 2.8 copy-dependencies @@ -94,6 +100,7 @@ org.apache.maven.plugins maven-resources-plugin + 2.6 UTF-8 @@ -101,6 +108,7 @@ org.apache.maven.plugins maven-source-plugin + 2.2.1 attach-sources @@ -113,6 +121,10 @@ org.apache.maven.plugins maven-javadoc-plugin + 2.9.1 + + UTF-8 + attach-javadocs @@ -125,11 +137,10 @@ org.apache.maven.plugins maven-release-plugin - 2.0-beta-7 + 2.4.1 - diff --git a/webmagic manual.md b/webmagic manual.md new file mode 100644 index 0000000..0c681c5 --- /dev/null +++ b/webmagic manual.md @@ -0,0 +1,360 @@ +webmagic使用手册 +------ +>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 + +>web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录、抓取静态资源等。 + +>作者黄亿华([code4crafter@gmail.com](code4crafter@gmail.com))目前就职于大众点评,曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。 + +>webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: + +>python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) + +>Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman) + +>webmagic遵循[Apache 
2.0协议](http://www.apache.org/licenses/LICENSE-2.0.html),你可以自由进行使用和修改。有使用不便或者问题,欢迎在github[提交issue](https://github.com/code4craft/webmagic/issues),或者在[oschina讨论模块](http://www.oschina.net/question)提问。 + +
+ + +## 快速开始 + +### 使用maven + +webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: + + git clone https://github.com/code4craft/webmagic.git + mvn clean install + +安装后,在项目中添加对应的依赖即可使用webmagic: + + + us.codecraft + webmagic-core + 0.2.0 + + + us.codecraft + webmagic-extension + 0.2.0 + + +#### 项目结构 + +webmagic主要包括两个包: + +* **webmagic-core** + + webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 + +* **webmagic-extension** + + webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 + +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: + +* **webmagic-saxon** + + webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。 + +* **webmagic-selenium** + + webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。 + +在项目中,你可以根据需要依赖不同的包。 + +### 不使用maven + +不使用maven的用户,可以下载这个二进制打包版本(感谢[oschina](http://www.oschina.net/)): + + git clone http://git.oschina.net/flashsword20/webmagic-bin.git + +在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。 + +### 第一个爬虫 + +#### 定制PageProcessor + +PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: + + public class OschinaBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net") + .addStartUrl("http://my.oschina.net/flashsword/blog"); + + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()) + .pipeline(new ConsolePipeline()).run(); + } + } + 
+这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。 + +Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。 + +执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。 + +#### 使用注解 + +webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: + + @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") + public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + public static void main(String[] args) { + OOSpider.create( + Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), + new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + } + +这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 + +注解的详细使用方式见后文中的webmagic-extension注解模块。 + +
+ + +## webmagic-core + +webmagic-core是爬虫的核心框架,只包括一个爬虫各功能模块的核心功能。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 + +此节部分内容摘自作者的博文 +[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796)。 + +### webmagic-core的模块划分 + +webmagic-core参考了scrapy的模块划分,分为Spider(整个爬虫的调度框架)、Downloader(页面下载)、PageProcessor(链接提取和页面分析)、Scheduler(URL管理)、Pipeline(离线分析和持久化)几部分。只不过scrapy通过middleware实现扩展,而webmagic则通过定义这几个接口,并将其不同的实现注入主框架类Spider来实现扩展。 + +![image](http://code4craft.github.io/images/posts/webmagic.png) +
+ +#### Spider类(核心调度) + +**Spider**是爬虫的入口类,Spider的接口调用采用了链式的API设计,其他功能全部通过接口注入Spider实现,下面是启动一个比较复杂的Spider的例子。 + + Spider.create(sinaBlogProcessor) + .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")) + .pipeline(new FilePipeline()) + .thread(10).run(); + + +Spider的核心处理流程非常简单,代码如下: + + + private void processRequest(Request request) { + Page page = downloader.download(request, this); + if (page == null) { + sleep(site.getSleepTime()); + return; + } + pageProcessor.process(page); + addRequest(page); + for (Pipeline pipeline : pipelines) { + pipeline.process(page, this); + } + sleep(site.getSleepTime()); + } + +#### PageProcessor(页面分析及链接抽取) + +页面分析是垂直爬虫中需要定制的部分。在webmagic-core里,通过实现**PageProcessor**接口来实现定制爬虫。PageProcessor有两个核心方法:public void process(Page page)和public Site getSite() 。 + +* public void process(Page page) + + 通过对**Page**对象的操作,实现爬虫逻辑。Page对象包括两个最重要的方法:addTargetRequests()可以添加URL到待抓取队列,putField()可以将结果保存供后续处理。 + Page的数据可以通过Page.getHtml()和Page.getUrl()获取。 + +* public Site getSite() + + **Site**对象定义了爬虫的域名、起始地址、抓取间隔、编码等信息。 + +**Selector**是webmagic为了简化页面抽取开发的独立模块,是webmagic-core的主要着力点。这里整合了CSS Selector、XPath和正则表达式,并可以进行链式的抽取。 + + + //content是用别的爬虫工具抽取到的正文 + List links = page.getHtml() + .$("div.title") //css 选择,Java里虽然很少有$符号出现,不过貌似$作为方法名是合法的 + .xpath("//@href") //提取链接 + .regex(".*blog.*") //正则匹配过滤 + .all(); //转换为string列表 + +webmagic包括一个对于页面正文的自动抽取的类**SmartContentSelector**。相信用过Evernote Clearly的人都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。 + +基于Saxon,webmagic提供了XPath2.0语法的支持。XPath2.0语法支持内部函数、逻辑控制等,是一门完整的语言,如果你熟悉XPath2.0语法,倒是不妨一试(需要引入**webmagic-saxon**包)。 + +**webmagic-samples**包里有一些为某个站点定制的PageProcessor,供学习之用。 + +#### Downloader(页面下载) + +**Downloader**是webmagic中下载页面的接口,主要方法: + +* public Page download(Request request, Task task) + + **Request**对象封装了待抓取的URL及其他信息,而Page则包含了页面下载后的Html及其他信息。Task是一个包装了任务对应的Site信息的抽象接口。 + +* public void setThread(int thread) + + 因为Downloader一般会涉及连接池等功能,而这些功能与多线程密切相关,所以定义了此方法。 + 
+目前有几个Downloader的实现: + +* HttpClientDownloader + + 集成了**Apache HttpClient**的Downloader。Apache HttpClient(4.0后整合到HttpComponents项目中)是强大的Java http下载器,它支持自定义HTTP头(对于爬虫比较有用的就是User-agent、cookie等)、自动redirect、连接复用、cookie保留、设置代理等诸多强大的功能。 + +* SeleniumDownloader + + 对于一些Javascript动态加载的网页,仅仅使用http模拟下载工具,并不能取到页面的内容。这方面的思路有两种:一种是抽丝剥茧,分析js的逻辑,再用爬虫去重现它;另一种就是:内置一个浏览器,直接获取最后加载完的页面。**webmagic-selenium**包中整合了Selenium到SeleniumDownloader,可以直接进行动态加载页面的抓取。 + +#### Scheduler(URL管理) + +**Scheduler**是webmagic的管理模块,通过实现Scheduler可以定制自己的URL管理器。Scheduler包括两个主要方法: + +* public void push(Request request,Task task) + + 将待抓取URL加入Scheduler。Request对象是对URL的一个封装,还包括优先级、以及一个供存储数据的Map。Task仍然用于区分不同任务,在多个任务共用一个Scheduler时可以以此进行区分。 + +* public Request poll(Task task) + + 从Scheduler里取出一条请求,并进行后续执行。 + +webmagic目前有三个Scheduler的实现: + +* QueueScheduler + + 一个简单的内存队列,速度较快,并且是线程安全的。 + +* FileCacheQueueScheduler + + 使用文件保存队列,它可以用于耗时较长的下载任务,在任务中途停止后(手动停止或者程序崩溃),下次执行仍然从中止的URL开始继续爬取。 + +* RedisScheduler + + 使用redis存储URL队列。通过使用同一台redis服务器存储URL,webmagic可以很容易地在多机部署,从而达到分布式爬虫的效果。 + +#### Pipeline(后续处理和持久化) + +**Pipeline**是最终抽取结果进行输出和持久化的接口。它只包括一个方法: + +* public void process(ResultItems resultItems,Task task) + + **ResultItems**是集成了抽取结果的对象。通过ResultItems.get(key)可以获取抽取结果。Task同样是用于区分不同任务的对象。 + +webmagic包括以下几个Pipeline的实现: + +* ConsolePipeline + + 直接输出结果到控制台,测试时使用。 + +* FilePipeline + + 输出结果到文件,每个URL单独保存到一个页面,以URL的MD5结果作为文件名。通过构造函数`public FilePipeline(String path)`定义存储路径,**以下使用文件持久化的类,多数都使用此方法指定路径**。 + +* JsonFilePipeline + + 以JSON输出结果到文件(.json后缀),其他与FilePipeline相同。 + +webmagic目前不支持持久化到数据库,但是结合其他工具,持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以这段代码并没有放到webmagic-samples里来。 + +
+ +## webmagic-extension + +webmagic-extension是为了开发爬虫更方便而实现的一些功能模块。这些功能完全基于webmagic-core的框架,包括注解形式编写爬虫、分页、分布式等功能。 + +### 注解模块 + +webmagic-extension包括注解模块。为什么会有注解方式? + +因为PageProcessor的方式灵活、强大,但是没有解决两个问题: + +* 对于一个站点,如果想抓取多种格式的URL,那么必须在PageProcessor中写判断逻辑,代码难以管理。 +* 抓取结果没有对应Model,并不符合Java程序开发习惯,与一些框架也无法很好整合。 + +注解的核心是Model类,本身是一个POJO,这个Model类用于传递、保存页面最终抓取结果数据。注解方式直接将抽取与数据绑定,以便于编写和维护。 + +注解方式其实也是通过一个PageProcessor的实现--ModelPageProcessor完成,因此对webmagic-core代码没有任何影响。 + +注解部分包括以下内容: + +* #### TargetUrl + + "TargetUrl"表示这个Model对应要抓取的URL,它包含两层意思:符合这个条件的URL会被加入抓取队列;符合这个条件的URL会被这个Model抓取。TargetUrl可以通过**sourceRegion**指定提取URL的区域(仅支持XPath)。 + + TargetUrl使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。 + + 与TargetUrl相似的还有**HelpUrl**,HelpUrl表示:仅仅抓取该URL用作链接提取,并不对它进行内容抽取。例如博客正文页对应TargetUrl,而列表页则对应HelpUrl。 + +* #### ExtractBy + + * ##### 用于字段 + + "ExtractBy"可用于类以及字段。用于字段时,定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式(通过设置type)。 + + ExtractBy还有几个扩展属性。**multi**表示是否抽取列表,当然,设置为multi时,你需要一个List字段去容纳它。**notnull**则表示,此字段不允许为null,若为null则放弃整个对象。 + + * ##### 用于类 + "ExtractBy"用于类时,则限定了字段抽取的区域。用于类时仍支持multi,multi则表示一个页面可以抽取到多个对象。 + + * ##### ExtractByRaw & ExtractByUrl + + 在类使用"ExtractBy"修饰后,字段的"ExtractBy"使用的是其抽取的结果,如果仍然想要抽取原HTML,可以使用"ExtractByRaw"。与此类似的还有"ExtractByUrl",表示从URL中抽取信息。ExtractByUrl只支持正则表达式。 + + * ##### ExtractBy2 ExtractBy3 + + "ExtractBy"、"ExtractByRaw"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。 + +* #### AfterExtractor + + AfterExtractor接口是对注解方式抽取能力不足的补充。实现AfterExtractor接口后,会在**使用注解方式填充完字段后**调用**afterProcess()**方法,在这个方法中可以直接访问已抽取的字段、补充需要抽取的字段,甚至做一些简单的输出和持久化操作(并不是很建议这么做)。这部分可以参考[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。 + +* #### OOSpider + OOSpider是注解式爬虫的入口,这里调用**create()**方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,例如: + + OOSpider.create( + 
Site.me().addStartUrl("http://www.oschina.net"), + new ConsolePageModelPipeline(), + OschinaBlog.class,OschinaAnswer.class).run(); + + OOSpider会根据TargetUrl调用不同的Model进行解析。 + +* #### PageModelPipeline + 可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。 + +* #### 分页 + + 处理单项数据分页(例如单条新闻多个页面)是爬虫一个比较头疼的问题。webmagic目前对于分页的解决方案是:在注解模式下,Model通过实现**PagedModel**接口,并引入PagedPipeline作为第一个Pipeline来实现。具体可以参考webmagic-samples中抓取网易新闻的代码:**us.codecraft.webmagic.model.samples.News163**。 + + 关于分页,这里有一篇对于webmagic分页实现的详细说明的文章[关于爬虫实现分页的一些思考](http://my.oschina.net/flashsword/blog/150039)。 + 目前分页功能还没有分布式实现,如果使用RedisScheduler进行分布式爬取,请不要使用分页功能。 + +### 分布式 + +webmagic-extension中,通过redis来管理URL,达到分布式的效果。但是对于分布式爬虫,仅仅程序能够分布式运行,还满足不了大规模抓取的需要,webmagic可能后期会加入一些任务管理和监控的功能,也欢迎各位用户为webmagic提交代码,做出贡献。 + + diff --git a/webmagic-core/README.md b/webmagic-core/README.md index 4964e16..90a6f0a 100644 --- a/webmagic-core/README.md +++ b/webmagic-core/README.md @@ -1,3 +1,3 @@ webmagic-core ------- -webmagic核心部分。 \ No newline at end of file +webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 \ No newline at end of file diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 60c37c0..cf42d2a 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -5,7 +5,7 @@ us.codecraft webmagic - 0.1.0 + 0.2.0 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 40f17f0..eb2c132 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -9,7 +9,7 @@ import java.util.List; /** *
- *Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
+ * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
  *
  *     主要方法:
  *     {@link #getUrl()} 获取页面的Url
@@ -19,6 +19,7 @@ import java.util.List;
  *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
  *
  * 
+ * * @author code4crafter@gmail.com
*/ public class Page { @@ -36,9 +37,16 @@ public class Page { public Page() { } + public Page setSkip(boolean skip) { + resultItems.setSkip(skip); + return this; + + } + /** * 保存抽取的结果 - * @param key 结果的key + * + * @param key 结果的key * @param field 结果的value */ public void putField(String key, Object field) { @@ -47,6 +55,7 @@ public class Page { /** * 获取页面的html内容 + * * @return html 页面的html内容 */ public Selectable getHtml() { @@ -63,6 +72,7 @@ public class Page { /** * 添加待抓取的链接 + * * @param requests 待抓取的链接 */ public void addTargetRequests(List requests) { @@ -79,6 +89,7 @@ public class Page { /** * 添加待抓取的链接 + * * @param requestString 待抓取的链接 */ public void addTargetRequest(String requestString) { @@ -93,6 +104,7 @@ public class Page { /** * 添加待抓取的页面,在需要传递附加信息时使用 + * * @param request 待抓取的页面 */ public void addTargetRequest(Request request) { @@ -103,6 +115,7 @@ public class Page { /** * 获取页面的Url + * * @return url 当前页面的url,可用于抽取 */ public Selectable getUrl() { @@ -111,6 +124,7 @@ public class Page { /** * 设置url + * * @param url */ public void setUrl(Selectable url) { @@ -119,6 +133,7 @@ public class Page { /** * 获取抓取请求 + * * @return request 抓取请求 */ public Request getRequest() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 42dd079..905dbe5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,5 +1,9 @@ package us.codecraft.webmagic; +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; + /** * Request对象封装了待抓取的url信息。
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
@@ -18,40 +22,95 @@ package us.codecraft.webmagic; * String linktext = (String)page.getRequest().getExtra()[0]; * } * + * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午11:37 + * Date: 13-4-21 + * Time: 上午11:37 */ -public class Request { +public class Request implements Serializable { + + private static final long serialVersionUID = 2062192774891352043L; private String url; - private Object[] extra; - /** - * 构建一个request对象 - * @param url 必须参数,待抓取的url - * @param extra 额外参数,可以保存一些需要的上下文信息 + * 额外参数,可以保存一些需要的上下文信息 */ - public Request(String url, Object... extra) { - this.url = url; - this.extra = extra; + private Map extras; + + private double priority; + + public Request() { } /** - * 获取预存的对象 - * @return object[] 预存的对象数组 + * 构建一个request对象 + * + * @param url 必须参数,待抓取的url */ - public Object[] getExtra() { - return extra; + public Request(String url) { + this.url = url; + } + + public double getPriority() { + return priority; + } + + public Request setPriority(double priority) { + this.priority = priority; + return this; + } + + public Object getExtra(String key) { + if (extras == null) { + return null; + } + return extras.get(key); + } + + public Request putExtra(String key, Object value) { + if (extras == null) { + extras = new HashMap(); + } + extras.put(key, value); + return this; } /** * 获取待抓取的url + * * @return url 待抓取的url */ public String getUrl() { return url; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Request request = (Request) o; + + if (!url.equals(request.url)) return false; + + return true; + } + + public Map getExtras() { + return extras; + } + + @Override + public int hashCode() { + return url.hashCode(); + } + + public void setExtras(Map extras) { + this.extras = extras; + } + + public void setUrl(String url) { + this.url = url; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 0c1d94c..7a8e5c3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -5,8 +5,8 @@ import java.util.Map; /** * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
- * @author yihua.huang@dianping.com
- * @date: 13-7-25
+ * @author code4crafter@gmail.com
+ * Date: 13-7-25
* Time: 下午12:20
*/ public class ResultItems { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 2c6118c..9ab97fe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.utils.UrlUtils; + import java.util.*; /** @@ -90,6 +92,11 @@ public class Site { * @return 已设置的domain */ public String getDomain() { + if (domain == null) { + if (startUrls.size() > 0) { + domain = UrlUtils.getDomain(startUrls.get(0)); + } + } return domain; } @@ -150,6 +157,7 @@ public class Site { /** * 获取初始页面的地址列表 + * * @return 初始页面的地址列表 */ public List getStartUrls() { @@ -158,6 +166,7 @@ public class Site { /** * 增加初始页面的地址,可反复调用此方法增加多个初始地址。 + * * @param startUrl 初始页面的地址 * @return this */ @@ -179,6 +188,7 @@ public class Site { /** * 获取两次抓取之间的间隔 + * * @return 两次抓取之间的间隔,单位毫秒 */ public int getSleepTime() { @@ -187,6 +197,7 @@ public class Site { /** * 获取重新下载的次数,默认为0 + * * @return 重新下载的次数 */ public int getRetryTimes() { @@ -195,6 +206,7 @@ public class Site { /** * 设置获取重新下载的次数,默认为0 + * * @return this */ public Site setRetryTimes(int retryTimes) { @@ -219,7 +231,7 @@ public class Site { return true; } - public Task toTask(){ + public Task toTask() { return new Task() { @Override public String getUUID() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a25fd02..cf62796 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -8,8 +8,8 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; -import 
us.codecraft.webmagic.schedular.QueueScheduler; -import us.codecraft.webmagic.schedular.Scheduler; +import us.codecraft.webmagic.scheduler.QueueScheduler; +import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.utils.ThreadUtils; import java.util.ArrayList; @@ -228,8 +228,10 @@ public class Spider implements Runnable, Task { } pageProcessor.process(page); addRequest(page); - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); + if (!page.getResultItems().isSkip()){ + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } } sleep(site.getSleepTime()); } @@ -283,6 +285,11 @@ public class Spider implements Runnable, Task { return this; } + public Spider clearPipeline(){ + pipelines=new ArrayList(); + return this; + } + @Override public String getUUID() { if (uuid != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java index 4f07528..6dcbde1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Destroyable.java @@ -2,8 +2,8 @@ package us.codecraft.webmagic.downloader; /** * 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。
- * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 下午3:10
*/ public interface Destroyable { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 97470e0..e1648fe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -7,29 +7,18 @@ import java.util.Map; /** * 命令行输出抽取结果。可用于测试。
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:45 + * Date: 13-4-21 + * Time: 下午1:45 */ -public class ConsolePipeline implements Pipeline{ +public class ConsolePipeline implements Pipeline { @Override - public void process(ResultItems resultItems,Task task) { - if (resultItems.isSkip()){ - return; - } - System.out.println("get page: "+resultItems.getRequest().getUrl()); + public void process(ResultItems resultItems, Task task) { + System.out.println("get page: " + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { - if (entry.getValue() instanceof Iterable) { - Iterable value = (Iterable) entry.getValue(); - System.out.println(entry.getKey() + ":"); - for (Object o : value) { - System.out.println(o); - } - } else { - System.out.println(entry.getKey() + ":\t" + entry.getValue()); - } - System.out.println(entry.getKey()+":\t"+entry.getValue()); + System.out.println(entry.getKey() + ":\t" + entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 01f8d8b..252ccd5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -20,12 +20,12 @@ import java.util.Map; */ public class FilePipeline implements Pipeline { - private String path = "/data/temp/webmagic/"; + private String path = "/data/webmagic/"; private Logger logger = Logger.getLogger(getClass()); /** - * 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/" + * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" */ public FilePipeline() { @@ -37,6 +37,9 @@ public class FilePipeline implements Pipeline { * @param path 文件保存路径 */ public FilePipeline(String path) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } this.path = path; } @@ -47,9 +50,6 @@ public class FilePipeline implements Pipeline { if (!file.exists()) { 
file.mkdirs(); } - if (resultItems.isSkip()) { - return; - } try { PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java similarity index 96% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 613e406..723b5f9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java similarity index 95% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java index 8d9649b..fc39b45 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html similarity index 100% rename from 
webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java new file mode 100644 index 0000000..997b6cf --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.selector; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 下午5:29
+ */ +public class AndSelector implements Selector { + + private List selectors = new ArrayList(); + + public AndSelector(Selector... selectors) { + for (Selector selector : selectors) { + this.selectors.add(selector); + } + } + + @Override + public String select(String text) { + for (Selector selector : selectors) { + if (text == null) { + return null; + } + text = selector.select(text); + } + return text; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + boolean first = true; + for (Selector selector : selectors) { + if (first) { + results = selector.selectList(text); + first = false; + } else { + List resultsTemp = new ArrayList(); + for (String result : results) { + resultsTemp.addAll(selector.selectList(result)); + } + results = resultsTemp; + if (results == null || results.size() == 0) { + return results; + } + } + } + return results; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java new file mode 100644 index 0000000..48f9fb9 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.selector; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 下午5:29
+ */ +public class OrSelector implements Selector { + + private List selectors = new ArrayList(); + + public OrSelector(Selector... selectors) { + for (Selector selector : selectors) { + this.selectors.add(selector); + } + } + + @Override + public String select(String text) { + for (Selector selector : selectors) { + text = selector.select(text); + if (text!=null){ + return text; + } + } + return null; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + for (Selector selector : selectors) { + List strings = selector.selectList(text); + results.addAll(strings); + } + return results; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 845c0b6..4af2b44 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -8,7 +8,7 @@ import java.util.List; * Date: 13-4-20 * Time: 下午8:02 */ -interface Selector { +public interface Selector { public String select(String text); diff --git a/webmagic-extension/README.md b/webmagic-extension/README.md new file mode 100644 index 0000000..71d3c48 --- /dev/null +++ b/webmagic-extension/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/pom.xml b/webmagic-extension/pom.xml similarity index 52% rename from webmagic-plugin/webmagic-misc/pom.xml rename to webmagic-extension/pom.xml index c545615..63034f2 100644 --- a/webmagic-plugin/webmagic-misc/pom.xml +++ b/webmagic-extension/pom.xml @@ -4,24 +4,33 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> us.codecraft - webmagic-plugin - 0.1.0 + webmagic + 0.2.0 4.0.0 - webmagic-misc + webmagic-extension - org.freemarker - freemarker - 2.3.15 + com.alibaba + 
fastjson + 1.1.35 redis.clients jedis 2.0.0 + + us.codecraft + webmagic-core + ${project.version} + + + junit + junit + \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java new file mode 100644 index 0000000..7d46cc2 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/PagedModel.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic; + +import java.util.Collection; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-4
+ * Time: 下午5:18
+ */ +public interface PagedModel { + + public String getPageKey(); + + public Collection getOtherPages(); + + public String getPage(); + + public PagedModel combine(PagedModel pagedModel); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java index 722a2eb..cca5b20 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -34,6 +34,9 @@ public class FileDownloader implements Downloader { } public FileDownloader(String path, Downloader downloaderWhenFileMiss) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } this.path = path; this.downloaderWhenFileMiss = downloaderWhenFileMiss; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java new file mode 100644 index 0000000..3927d11 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java @@ -0,0 +1,15 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Page; + +/** + * 实现这个接口即可在抽取后进行后处理。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 上午9:42
+ */ +public interface AfterExtractor { + + public void afterProcess(Page page); +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java new file mode 100644 index 0000000..c841f10 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 下午3:41
+ */ +public class ConsolePageModelPipeline implements PageModelPipeline { + @Override + public void process(Object o, Task task) { + System.out.println(ToStringBuilder.reflectionToString(o)); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java new file mode 100644 index 0000000..0494076 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.selector.Selector; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午9:48
+ */ +class Extractor { + + protected Selector selector; + + protected final Source source; + + protected final boolean notNull; + + protected final boolean multi; + + static enum Source {Html, Url, RawHtml} + + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { + this.selector = selector; + this.source = source; + this.notNull = notNull; + this.multi = multi; + } + + Selector getSelector() { + return selector; + } + + Source getSource() { + return source; + } + + boolean isNotNull() { + return notNull; + } + + boolean isMulti() { + return multi; + } + + void setSelector(Selector selector) { + this.selector = selector; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java new file mode 100644 index 0000000..4ec1bbc --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.selector.Selector; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午9:48
+ */ +class FieldExtractor extends Extractor{ + + private final Field field; + + private Method setterMethod; + + public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) { + super(selector, source, notNull,multi); + this.field = field; + } + + Field getField() { + return field; + } + + Selector getSelector() { + return selector; + } + + Source getSource() { + return source; + } + + void setSetterMethod(Method setterMethod) { + this.setterMethod = setterMethod; + } + + Method getSetterMethod() { + return setterMethod; + } + + boolean isNotNull() { + return notNull; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java new file mode 100644 index 0000000..af762ec --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -0,0 +1,89 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selector; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * 基于PageProcessor的扩展点。
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:46
+ */ +class ModelPageProcessor implements PageProcessor { + + private List pageModelExtractorList = new ArrayList(); + + private Site site; + + private Set targetUrlPatterns = new HashSet(); + + public static ModelPageProcessor create(Site site, Class... clazzs) { + ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); + for (Class clazz : clazzs) { + modelPageProcessor.addPageModel(clazz); + } + return modelPageProcessor; + } + + + public ModelPageProcessor addPageModel(Class clazz) { + PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); + targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); + targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); + pageModelExtractorList.add(pageModelExtractor); + return this; + } + + private ModelPageProcessor(Site site) { + this.site = site; + } + + @Override + public void process(Page page) { + for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { + extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); + extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); + Object process = pageModelExtractor.process(page); + if (process == null || (process instanceof List && ((List) process).size() == 0)) { + page.getResultItems().setSkip(true); + } + postProcessPageModel(pageModelExtractor.getClazz(), process); + page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); + } + } + + private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { + List links; + if (urlRegionSelector == null) { + links = page.getHtml().links().all(); + } else { + links = urlRegionSelector.selectList(page.getHtml().toString()); + } + for (String link : links) { + for (Pattern targetUrlPattern : urlPatterns) { + Matcher matcher = targetUrlPattern.matcher(link); + if (matcher.find()) { + page.addTargetRequest(new 
Request(matcher.group(1))); + } + } + } + } + + protected void postProcessPageModel(Class clazz, Object object) { + } + + @Override + public Site getSite() { + return site; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java new file mode 100644 index 0000000..07d6c5a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.pipeline.Pipeline; + +import java.lang.annotation.Annotation; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * 基于Pipeline的扩展点,用于实现注解格式的Pipeline。
+ * 与PageModelPipeline是一对多的关系(原谅作者没有更好的名字了)。
+ * @author code4crafter@gmail.com
+ * Date: 13-8-2
+ * Time: 上午10:47
+ */ +class ModelPipeline implements Pipeline { + + private Map pageModelPipelines = new ConcurrentHashMap(); + + public ModelPipeline() { + } + + public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipeline) { + pageModelPipelines.put(clazz, pageModelPipeline); + return this; + } + + @Override + public void process(ResultItems resultItems, Task task) { + for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { + Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); + if (o != null) { + Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class); + if (annotation == null || !((ExtractBy) annotation).multi()) { + classPageModelPipelineEntry.getValue().process(o, task); + } else { + List list = (List) o; + for (Object o1 : list) { + classPageModelPipelineEntry.getValue().process(o1, task); + } + } + } + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java new file mode 100644 index 0000000..e5a41e1 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -0,0 +1,56 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; + +/** + * 基于Model的Spider,封装后的入口类。
+ * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 上午9:51
+ */ +public class OOSpider extends Spider { + + private ModelPageProcessor modelPageProcessor; + + private ModelPipeline modelPipeline; + + protected OOSpider(ModelPageProcessor modelPageProcessor) { + super(modelPageProcessor); + this.modelPageProcessor = modelPageProcessor; + } + + /** + * 创建一个爬虫。
+ * @param site + * @param pageModelPipeline + * @param pageModels + */ + public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + this(ModelPageProcessor.create(site, pageModels)); + this.modelPipeline = new ModelPipeline(); + super.pipeline(modelPipeline); + if (pageModelPipeline!=null){ + for (Class pageModel : pageModels) { + this.modelPipeline.put(pageModel, pageModelPipeline); + } + } + } + + public static OOSpider create(Site site, Class... pageModels) { + return new OOSpider(site, null, pageModels); + } + + public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(site, pageModelPipeline, pageModels); + } + + public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { + for (Class pageModel : pageModels) { + modelPageProcessor.addPageModel(pageModel); + modelPipeline.put(pageModel, pageModelPipeline); + } + return this; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java new file mode 100644 index 0000000..2f9004b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -0,0 +1,355 @@ +package us.codecraft.webmagic.model; + +import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.annotation.*; +import us.codecraft.webmagic.selector.*; + +import java.lang.annotation.Annotation; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午9:33
+ */ +class PageModelExtractor { + + private List targetUrlPatterns = new ArrayList(); + + private Selector targetUrlRegionSelector; + + private List helpUrlPatterns = new ArrayList(); + + private Selector helpUrlRegionSelector; + + private Class clazz; + + private List fieldExtractors; + + private Extractor extractor; + + public static PageModelExtractor create(Class clazz) { + PageModelExtractor pageModelExtractor = new PageModelExtractor(); + pageModelExtractor.init(clazz); + return pageModelExtractor; + } + + private void init(Class clazz) { + this.clazz = clazz; + initClassExtractors(); + fieldExtractors = new ArrayList(); + for (Field field : clazz.getDeclaredFields()) { + field.setAccessible(true); + FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field); + FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field); + if (fieldExtractor != null && fieldExtractorTmp != null) { + throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + } else if (fieldExtractor == null && fieldExtractorTmp != null) { + fieldExtractor = fieldExtractorTmp; + } + // ExtractBy2 & ExtractBy3 + if (fieldExtractor!=null){ + addAnnotationExtractBy2(fieldExtractor); + addAnnotationExtractBy3(fieldExtractor); + } + fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); + if (fieldExtractor != null && fieldExtractorTmp != null) { + throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + } else if (fieldExtractor == null && fieldExtractorTmp != null) { + fieldExtractor = fieldExtractorTmp; + } + if (fieldExtractor != null) { + if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " 
must be list"); + } + fieldExtractors.add(fieldExtractor); + } + } + } + + private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; + ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); + if (extractByUrl != null) { + String regexPattern = extractByUrl.value(); + if (regexPattern.trim().equals("")) { + regexPattern = ".*"; + } + fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + } + return fieldExtractor; + } + + private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; + ExtractBy extractBy = field.getAnnotation(ExtractBy.class); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + default: + selector = new XpathSelector(value); + } + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + } + return fieldExtractor; + } + + private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) { + ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + 
break; + default: + selector = new XpathSelector(value); + } + fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); + } + } + + private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) { + ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class); + if (extractBy != null) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + default: + selector = new XpathSelector(value); + } + fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); + } + } + + private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) { + FieldExtractor fieldExtractor = null; + ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); + if (extractByRaw != null) { + String value = extractByRaw.value(); + Selector selector; + switch (extractByRaw.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + default: + selector = new XpathSelector(value); + } + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); + Method setterMethod = getSetterMethod(clazz, field); + if (setterMethod != null) { + fieldExtractor.setSetterMethod(setterMethod); + } + } + return fieldExtractor; + } + + public static Method getSetterMethod(Class clazz, Field field) { + String name = "set" + StringUtils.capitalize(field.getName()); + try { + Method declaredMethod = clazz.getDeclaredMethod(name, field.getType()); + declaredMethod.setAccessible(true); + return declaredMethod; + } catch (NoSuchMethodException e) { + return null; + } + } + + private void initClassExtractors() { + 
Annotation annotation = clazz.getAnnotation(TargetUrl.class); + if (annotation == null) { + targetUrlPatterns.add(Pattern.compile(".*")); + } else { + TargetUrl targetUrl = (TargetUrl) annotation; + String[] value = targetUrl.value(); + for (String s : value) { + targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); + } + if (!targetUrl.sourceRegion().equals("")) { + targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion()); + } + } + annotation = clazz.getAnnotation(HelpUrl.class); + if (annotation != null) { + HelpUrl helpUrl = (HelpUrl) annotation; + String[] value = helpUrl.value(); + for (String s : value) { + helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); + } + if (!helpUrl.sourceRegion().equals("")) { + helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion()); + } + } + annotation = clazz.getAnnotation(ExtractBy.class); + if (annotation != null) { + ExtractBy extractBy = (ExtractBy) annotation; + extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + } + } + + public Object process(Page page) { + boolean matched = false; + for (Pattern targetPattern : targetUrlPatterns) { + if (targetPattern.matcher(page.getUrl().toString()).matches()) { + matched = true; + } + } + if (!matched) { + return null; + } + if (extractor == null) { + return processSingle(page, page.getHtml().toString()); + } else { + if (extractor.multi) { + List os = new ArrayList(); + List list = extractor.getSelector().selectList(page.getHtml().toString()); + for (String s : list) { + Object o = processSingle(page, s); + if (o != null) { + os.add(o); + } + } + return os; + } else { + String select = extractor.getSelector().select(page.getHtml().toString()); + Object o = processSingle(page, select); + return o; + } + } + } + + private Object processSingle(Page page, String html) { + Object o = null; + try 
{ + o = clazz.newInstance(); + for (FieldExtractor fieldExtractor : fieldExtractors) { + if (fieldExtractor.isMulti()) { + List value; + switch (fieldExtractor.getSource()) { + case RawHtml: + value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + break; + case Html: + value = fieldExtractor.getSelector().selectList(html); + break; + case Url: + value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + default: + value = fieldExtractor.getSelector().selectList(html); + } + if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { + return null; + } + setField(o, fieldExtractor, value); + } else { + String value; + switch (fieldExtractor.getSource()) { + case RawHtml: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); + break; + case Html: + value = fieldExtractor.getSelector().select(html); + break; + case Url: + value = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + default: + value = fieldExtractor.getSelector().select(html); + } + if (value == null && fieldExtractor.isNotNull()) { + return null; + } + setField(o, fieldExtractor, value); + } + } + if (AfterExtractor.class.isAssignableFrom(clazz)) { + ((AfterExtractor) o).afterProcess(page); + } + } catch (InstantiationException e) { + e.printStackTrace(); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } catch (InvocationTargetException e) { + e.printStackTrace(); + } + return o; + } + + private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (fieldExtractor.getSetterMethod() != null) { + fieldExtractor.getSetterMethod().invoke(o, value); + } + fieldExtractor.getField().set(o, value); + } + + Class getClazz() { + return clazz; + } + + List getTargetUrlPatterns() { + return targetUrlPatterns; + } + + List getHelpUrlPatterns() { + return helpUrlPatterns; + } + + Selector getTargetUrlRegionSelector() { 
+ return targetUrlRegionSelector; + } + + Selector getHelpUrlRegionSelector() { + return helpUrlRegionSelector; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java new file mode 100644 index 0000000..a70137f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelPipeline.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Task; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 上午9:34
+ */ +public interface PageModelPipeline { + + public void process(T t, Task task); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java new file mode 100644 index 0000000..8c12ce1 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -0,0 +1,50 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD, ElementType.TYPE}) +public @interface ExtractBy { + + /** + * 抽取规则 + * + * @return 抽取规则 + */ + String value(); + + public enum Type {XPath, Regex, Css} + + /** + * 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath + * + * @return 抽取规则类型 + */ + Type type() default Type.XPath; + + /** + * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * + * @return 是否是不能为空的关键字段 + */ + boolean notNull() default false; + + /** + * 是否抽取多个结果
+ * 用于字段时,需要List来盛放结果
+ * 用于类时,表示单页抽取多个对象
+ * + * @return 是否抽取多个结果 + */ + boolean multi() default false; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java new file mode 100644 index 0000000..2a4f080 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractBy2 { + + String value(); + + public enum Type {XPath, Regex, Css} + + Type type() default Type.XPath; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java new file mode 100644 index 0000000..741682d --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractBy3 { + + String value(); + + public enum Type { XPath, Regex, Css} + + Type type() default Type.XPath; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java new file mode 100644 index 0000000..a3ae3e5 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java @@ -0,0 +1,49 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD, ElementType.TYPE}) +public @interface ExtractByRaw { + + /** + * 抽取规则 + * + * @return 抽取规则 + */ + String value(); + + public enum Type {XPath, Regex, Css} + + /** + * 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath + * + * @return 抽取规则类型 + */ + Type type() default Type.XPath; + + /** + * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * + * @return 是否是不能为空的关键字段 + */ + boolean notNull() default false; + + /** + * 是否抽取多个结果
+ * 需要List来盛放结果
+ * + * @return 是否抽取多个结果 + */ + boolean multi() default false; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java new file mode 100644 index 0000000..51b5f0d --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义类或者字段的抽取规则(从url中抽取,只支持正则表达式)。
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface ExtractByUrl{ + + /** + * 抽取规则,支持正则表达式 + * + * @return 抽取规则 + */ + String value() default ""; + + /** + * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * + * @return 是否是不能为空的关键字段 + */ + boolean notNull() default false; + + /** + * 是否抽取多个结果
+ * 用于字段时,需要List来盛放结果
+ * 用于类时,表示单页抽取多个对象
+ * + * @return 是否抽取多个结果 + */ + boolean multi() default false; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java new file mode 100644 index 0000000..9a0cce4 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java @@ -0,0 +1,30 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义辅助爬取的url。
+ * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE}) +public @interface HelpUrl { + + /** + * 某个类对应的URL规则列表
+ * webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。
+ * + * @return 抽取规则 + */ + String[] value(); + + /** + * 指定提取URL的区域(仅支持XPath) + * @return 指定提取URL的区域 + */ + String sourceRegion() default ""; +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java new file mode 100644 index 0000000..e12fca3 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * 定义某个类抽取的范围和来源,sourceRegion可以用xpath语法限定抽取区域。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-1
+ * Time: 下午8:40
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE}) +public @interface TargetUrl { + + /** + * 某个类对应的URL规则列表
+ * webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。
+ * + * @return 抽取规则 + */ + String[] value(); + + /** + * 指定提取URL的区域(仅支持XPath) + * @return 指定提取URL的区域 + */ + String sourceRegion() default ""; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html new file mode 100644 index 0000000..1e3004f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html @@ -0,0 +1,5 @@ + + +webmagic注解抓取方式所定义的注解。 + + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html new file mode 100644 index 0000000..d62cc00 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html @@ -0,0 +1,5 @@ + + +webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。 + + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java new file mode 100644 index 0000000..53dba9e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.pipeline; + +import com.alibaba.fastjson.JSON; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +/** + * JSON格式持久化到文件的接口。 + * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午6:28 + */ +public class JsonFilePipeline implements Pipeline { + + private String path = "/data/webmagic/"; + + private Logger logger = Logger.getLogger(getClass()); + + /** + * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" + */ + public JsonFilePipeline() { + + } + + /** + * 新建一个FilePipeline + * + * @param path 文件保存路径 + */ + public JsonFilePipeline(String path) { + if (!path.endsWith("/")&&!path.endsWith("\\")){ + path+="/"; + } + this.path = path; + } + + @Override + public void process(ResultItems resultItems, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + File file = new File(path); + if (!file.exists()) { + file.mkdirs(); + } + try { + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")); + printWriter.write(JSON.toJSONString(resultItems.getAll())); + printWriter.close(); + } catch (IOException e) { + logger.warn("write file error", e); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java new file mode 100644 index 0000000..beda667 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -0,0 +1,84 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.PagedModel; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.utils.DoubleKeyMap; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; + +/** + * 用于实现分页的Pipeline。
+ * 在使用redis做分布式爬虫时,请不要使用此功能。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-4
+ * Time: 下午5:15
+ */ +public class PagedPipeline implements Pipeline { + + private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class); + + private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class); + + @Override + public void process(ResultItems resultItems, Task task) { + Map resultItemsAll = resultItems.getAll(); + Iterator> iterator = resultItemsAll.entrySet().iterator(); + while (iterator.hasNext()) { + handleObject(iterator); + } + } + + private void handleObject(Iterator> iterator) { + Map.Entry objectEntry = iterator.next(); + Object o = objectEntry.getValue(); + if (o instanceof PagedModel) { + PagedModel pagedModel = (PagedModel) o; + pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE); + if (pagedModel.getOtherPages() != null) { + for (String otherPage : pagedModel.getOtherPages()) { + Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); + if (aBoolean == null) { + pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + } + } + } + //check if all pages are processed + Map booleanMap = pageMap.get(pagedModel.getPageKey()); + objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel); + if (booleanMap == null) { + return; + } + for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) { + if (!stringBooleanEntry.getValue()) { + iterator.remove(); + return; + } + } + List> entryList = new ArrayList>(); + entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet()); + if (entryList.size() != 0) { + Collections.sort(entryList, new Comparator>() { + @Override + public int compare(Map.Entry o1, Map.Entry o2) { + try { + int i1 = Integer.parseInt(o1.getKey()); + int i2 = Integer.parseInt(o2.getKey()); + return i1 - i2; + } catch (NumberFormatException e) { + return o1.getKey().compareTo(o2.getKey()); + } + } + }); + PagedModel value = entryList.get(0).getValue(); + for (int i = 1; i < entryList.size(); i++) { + value = value.combine(entryList.get(i).getValue()); + } + 
objectEntry.setValue(value); + } + } + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java similarity index 96% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index f5393a3..a8dc23a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; @@ -46,6 +46,9 @@ public class FileCacheQueueScheduler implements Scheduler { private Set urls; public FileCacheQueueScheduler(String filePath) { + if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){ + filePath+="/"; + } this.filePath = filePath; } diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java similarity index 61% rename from webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 382642b..e7c5bcd 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,17 +1,18 @@ package us.codecraft.webmagic.scheduler; +import com.alibaba.fastjson.JSON; +import org.apache.commons.codec.digest.DigestUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import 
redis.clients.jedis.JedisPoolConfig; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.schedular.Scheduler; /** * 使用redis管理url,构建一个分布式的爬虫。
* - * @author yihua.huang@dianping.com
- * @date: 13-7-25
+ * @author code4crafter@gmail.com
+ * Date: 13-7-25
* Time: 上午7:07
*/ public class RedisScheduler implements Scheduler { @@ -22,6 +23,8 @@ public class RedisScheduler implements Scheduler { private static final String SET_PREFIX = "set_"; + private static final String ITEM_PREFIX = "item_"; + public RedisScheduler(String host) { pool = new JedisPool(new JedisPoolConfig(), host); } @@ -33,7 +36,12 @@ public class RedisScheduler implements Scheduler { if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { //使用List保存队列 jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); - jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); + jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); + if (request.getExtras() != null) { + String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl()); + byte[] bytes = JSON.toJSONString(request).getBytes(); + jedis.set(key.getBytes(), bytes); + } } pool.returnResource(jedis); } @@ -42,10 +50,16 @@ public class RedisScheduler implements Scheduler { public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); - pool.returnResource(jedis); - if (url==null){ + if (url == null) { return null; } + String key = ITEM_PREFIX + DigestUtils.shaHex(url); + byte[] bytes = jedis.get(key.getBytes()); + if (bytes != null) { + Request o = JSON.parseObject(new String(bytes),Request.class); + return o; + } + pool.returnResource(jedis); return new Request(url); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java new file mode 100755 index 0000000..b284a15 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -0,0 +1,111 @@ +package us.codecraft.webmagic.utils; + +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date Dec 14, 2012 + */ +public class DoubleKeyMap extends 
MultiKeyMapBase { + private Map> map; + + public DoubleKeyMap() { + init(); + } + + public DoubleKeyMap(Map> map) { + this(map,DEFAULT_CLAZZ); + } + + public DoubleKeyMap(Class protoMapClass) { + super(protoMapClass); + init(); + } + + private void init() { + if (map == null) { + map = this.>newMap(); + } + } + + /** + * init map with protoMapClass + * + * @param protoMapClass + */ + @SuppressWarnings("rawtypes") + public DoubleKeyMap(Map> map, Class protoMapClass) { + super(protoMapClass); + this.map = map; + init(); + } + + /** + * @param key + * @return map + */ + public Map get(K1 key) { + return map.get(key); + } + + /** + * @param key1 + * @param key2 + * @return value + */ + public V get(K1 key1, K2 key2) { + if (get(key1) == null) { + return null; + } + return get(key1).get(key2); + } + + + /** + * @param key1 + * @param submap + * @return + */ + public V put(K1 key1, Map submap) { + return put(key1, submap); + } + + /** + * @param key1 + * @param key2 + * @param value + * @return + */ + public V put(K1 key1, K2 key2, V value) { + if (map.get(key1) == null) { + map.put(key1, this.newMap()); + } + return get(key1).put(key2, value); + } + + /** + * @param key1 + * @param key2 + * @return + */ + public V remove(K1 key1, K2 key2) { + if (get(key1) == null) { + return null; + } + V remove = get(key1).remove(key2); + // 如果上一级map为空,把它也回收掉 + if (get(key1).size() == 0) { + remove(key1); + } + return remove; + } + + /** + * @param key1 + * @return + */ + public Map remove(K1 key1) { + Map remove = map.remove(key1); + return remove; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java new file mode 100755 index 0000000..89fdc9a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.utils; + +/** + * @author code4crafter@gmail.com + * Date Dec 
14, 2012 + */ + +import java.util.HashMap; +import java.util.Map; + +/** + * multikey map, some basic objects * + * + * @author yihua.huang + */ +public abstract class MultiKeyMapBase { + + protected static final Class DEFAULT_CLAZZ = HashMap.class; + @SuppressWarnings("rawtypes") + private Class protoMapClass = DEFAULT_CLAZZ; + + public MultiKeyMapBase() { + } + + @SuppressWarnings("rawtypes") + public MultiKeyMapBase(Class protoMapClass) { + this.protoMapClass = protoMapClass; + } + + @SuppressWarnings("unchecked") + protected Map newMap() { + try { + return (Map) protoMapClass.newInstance(); + } catch (InstantiationException e) { + throw new IllegalArgumentException("wrong proto type map " + + protoMapClass); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("wrong proto type map " + + protoMapClass); + } + } +} \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java similarity index 74% rename from webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 6db21a8..0819e43 100644 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** - * @author yihua.huang@dianping.com
- * @date: 13-7-25
+ * @author code4crafter@gmail.com
+ * Date: 13-7-25
* Time: 上午7:51
*/ public class RedisSchedulerTest { @@ -35,8 +35,11 @@ public class RedisSchedulerTest { return null; } }; - redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task); + Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"); + request.putExtra("1","2"); + redisScheduler.push(request, task); Request poll = redisScheduler.poll(task); + System.out.println(poll); } } diff --git a/webmagic-lucene/README.md b/webmagic-lucene/README.md new file mode 100644 index 0000000..77050ab --- /dev/null +++ b/webmagic-lucene/README.md @@ -0,0 +1,3 @@ +webmagic-lucene +-------- +尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。 \ No newline at end of file diff --git a/webmagic-lucene/pom.xml b/webmagic-lucene/pom.xml new file mode 100644 index 0000000..d7b4665 --- /dev/null +++ b/webmagic-lucene/pom.xml @@ -0,0 +1,37 @@ + + + + webmagic + us.codecraft + 0.2.0 + + 4.0.0 + + webmagic-lucene + + + + org.apache.lucene + lucene-analyzers-common + 4.4.0 + + + org.apache.lucene + lucene-queryparser + 4.4.0 + + + us.codecraft + webmagic-extension + ${project.version} + + + junit + junit + + + + + \ No newline at end of file diff --git a/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java new file mode 100644 index 0000000..6fe2702 --- /dev/null +++ b/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -0,0 +1,92 @@ +package us.codecraft.webmagic.pipeline; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import 
org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-5
+ * Time: 下午2:11
+ */ +public class LucenePipeline implements Pipeline { + + private Directory directory; + + private Analyzer analyzer; + + private IndexWriterConfig config; + + private void init() throws IOException { + analyzer = new StandardAnalyzer(Version.LUCENE_44); + directory = new RAMDirectory(); + config = new IndexWriterConfig(Version.LUCENE_44, analyzer); + } + + public LucenePipeline() { + try { + init(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public List search(String fieldName, String value) throws IOException, ParseException { + List documents = new ArrayList(); + DirectoryReader ireader = DirectoryReader.open(directory); + IndexSearcher isearcher = new IndexSearcher(ireader); + // Parse a simple query that searches for "text": + QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer); + Query query = parser.parse(value); + ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + // Iterate through the results: + for (int i = 0; i < hits.length; i++) { + Document hitDoc = isearcher.doc(hits[i].doc); + documents.add(hitDoc); + } + ireader.close(); + return documents; + } + + @Override + public void process(ResultItems resultItems, Task task) { + if (resultItems.isSkip()){ + return; + } + Document doc = new Document(); + Map all = resultItems.getAll(); + if (all==null){ + return; + } + for (Map.Entry objectEntry : all.entrySet()) { + doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED)); + } + try { + IndexWriter indexWriter = new IndexWriter(directory, config); + indexWriter.addDocument(doc); + indexWriter.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java new file mode 100644 index 0000000..b350370 --- /dev/null +++ 
b/webmagic-lucene/src/main/test/java/us/codecraft/webmagic/lucene/OschinaBlog.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.lucene; + +import org.apache.lucene.document.Document; +import org.apache.lucene.queryparser.classic.ParseException; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.LucenePipeline; + +import java.io.IOException; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-2
+ * Time: 上午7:52
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "OschinaBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + LucenePipeline pipeline = new LucenePipeline(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync(); + while (true) { + try { + List search = pipeline.search("title", "webmagic"); + System.out.println(search); + Thread.sleep(3000); + } catch (IOException e) { + e.printStackTrace(); + } catch (ParseException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } +} diff --git a/webmagic-plugin/README.md b/webmagic-plugin/README.md deleted file mode 100644 index 536d596..0000000 --- a/webmagic-plugin/README.md +++ /dev/null @@ -1,6 +0,0 @@ -webmagic-plugin -------- -webmagic的插件模块。 -目前仅实现了freemarker模板渲染,和redis实现分布式爬虫。 - -另外有一个使用Selenium来动态渲染页面的模块在开发中。 \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java deleted file mode 100644 index 9a045ef..0000000 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ /dev/null @@ -1,60 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import freemarker.template.Configuration; -import freemarker.template.Template; -import freemarker.template.TemplateException; -import org.apache.commons.codec.digest.DigestUtils; -import 
us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-8 - * Time: 下午9:00 - */ -public class FreemarkerPipeline implements Pipeline { - - private Configuration configuration; - - private Template template; - - private String path = "/data/temp/webmagic/ftl/"; - - public FreemarkerPipeline(String template, String path) throws IOException { - configuration = new Configuration(); - configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile())); - this.template = configuration.getTemplate(template); - this.path = path; - new File(path); - } - - public FreemarkerPipeline(String template) throws IOException { - this(template, "/data/temp/webmagic/ftl/"); - } - - - @Override - public void process(ResultItems resultItems, Task task) { - if (resultItems.isSkip()) { - return; - } - String path = this.path + "" + task.getUUID() + "/"; - File file = new File(path); - if (!file.exists()) { - file.mkdirs(); - } - try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); - template.process(resultItems.getAll(), printWriter); - printWriter.close(); - } catch (TemplateException e) { - } catch (IOException e) { - e.printStackTrace(); - } - } -} diff --git a/webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl b/webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl deleted file mode 100644 index 61820b7..0000000 --- a/webmagic-plugin/webmagic-misc/src/main/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,23 +0,0 @@ - - $it.Title - http://127.0.0.1/wordpress/?p=$it.Id - ${date} - admin - http://127.0.0.1/wordpress/?p=$it.Id - - - - <#--$it.Id--> - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - $tags - \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java b/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java deleted 
file mode 100644 index 8ceb99f..0000000 --- a/webmagic-plugin/webmagic-misc/src/test/java/us/codecraft/webmagic/FreemarkerPipelineTest.java +++ /dev/null @@ -1,19 +0,0 @@ -package us.codecraft.webmagic; - -import org.junit.Test; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; - -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午7:14 - */ -public class FreemarkerPipelineTest { - - @Test - public void testTemplateLoad() throws IOException { - new FreemarkerPipeline("wordpress.ftl"); - } -} diff --git a/webmagic-plugin/webmagic-selenium/README.md b/webmagic-plugin/webmagic-selenium/README.md deleted file mode 100644 index 5e5ce82..0000000 --- a/webmagic-plugin/webmagic-selenium/README.md +++ /dev/null @@ -1,3 +0,0 @@ -webmagic-selenium -------- -尝试使用selenium来进行页面动态渲染,开发中。 \ No newline at end of file diff --git a/webmagic-samples/README.md b/webmagic-samples/README.md new file mode 100644 index 0000000..7cdad18 --- /dev/null +++ b/webmagic-samples/README.md @@ -0,0 +1,3 @@ +webmagic-samples +------- +webmagic的一些示例。包括抓取常见博客、信息类网站等。 \ No newline at end of file diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 8a7e00c..9d00d2f 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ us.codecraft webmagic - 0.1.0 + 0.2.0 4.0.0 @@ -19,12 +19,7 @@ us.codecraft - webmagic-misc - ${project.version} - - - us.codecraft - webmagic-selenium + webmagic-extension ${project.version} @@ -33,4 +28,23 @@ + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + ./lib/ + us.codecraft.webmagic.main.QuickStarter + + + + + + + \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java new file mode 100644 index 0000000..52be272 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -0,0 +1,70 @@ +package us.codecraft.webmagic.main; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.samples.IteyeBlog; +import us.codecraft.webmagic.model.samples.News163; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.pipeline.ConsolePipeline; +import 
us.codecraft.webmagic.pipeline.PagedPipeline; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Scanner; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-7
+ * Time: 下午9:24
+ */ +public class QuickStarter { + + private static Map clazzMap; + + private static Map urlMap; + + private static void init(){ + clazzMap = new LinkedHashMap(); + clazzMap.put("1", OschinaBlog.class); + clazzMap.put("2", IteyeBlog.class); + clazzMap.put("3", News163.class); + urlMap = new LinkedHashMap(); + urlMap.put("1", "http://my.oschina.net/flashsword/blog"); + urlMap.put("2", "http://flashsword20.iteye.com/"); + urlMap.put("3", "http://news.163.com/"); + } + + public static void main(String[] args) { + init(); + String key = null; + key = readKey(key); + System.out.println("The demo started and will last 20 seconds..."); + //Start spider + OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync(); + + try { + Thread.sleep(20000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + System.out.println("The demo stopped!"); + System.out.println("To more usage, try to customize your own Spider!"); + System.exit(0); + } + + private static String readKey(String key) { + Scanner stdin = new Scanner(System.in); + System.out.println("Choose a Spider demo:"); + for (Map.Entry classEntry : clazzMap.entrySet()) { + System.out.println(classEntry.getKey()+"\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey())); + } + while (key == null) { + key = new String(stdin.nextLine()); + if (clazzMap.get(key) == null) { + System.out.println("Invalid choice!"); + key = null; + } + } + return key; + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java new file mode 100644 index 0000000..509aaf9 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.model.samples; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-2
+ * Time: 上午8:10
+ */ +public interface Blog { + + public String getTitle(); + + public String getContent(); +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java new file mode 100644 index 0000000..ae94525 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-2
+ * Time: 上午7:52
+ */ +@TargetUrl("http://*.iteye.com/blog/*") +public class IteyeBlog implements Blog{ + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css) + private String content; + + @Override + public String toString() { + return "IteyeBlog{" + + "title='" + title + '\'' + + ", content='" + content + '\'' + + '}'; + } + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run(); + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java new file mode 100644 index 0000000..8c0e32d --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.PagedModel; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.*; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractBy2; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.ConsolePipeline; +import us.codecraft.webmagic.pipeline.PagedPipeline; +import us.codecraft.webmagic.scheduler.RedisScheduler; + +import java.util.Collection; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-4
+ * Time: 下午8:17
+ */ +@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") +public class News163 implements PagedModel { + + @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html") + private String pageKey; + + @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) + private String page; + + @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false) + @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex) + private List otherPage; + + @ExtractBy("//h1[@id=\"h1title\"]/text()") + private String title; + + @ExtractBy("//div[@id=\"epContentLeft\"]") + private String content; + + @Override + public String getPageKey() { + return pageKey; + } + + @Override + public Collection getOtherPages() { + return otherPage; + } + + @Override + public String getPage() { + if (page == null) { + return "1"; + } + return page; + } + + @Override + public PagedModel combine(PagedModel pagedModel) { + News163 news163 = new News163(); + news163.title = this.title; + News163 pagedModel1 = (News163) pagedModel; + news163.content = this.content + pagedModel1.content; + return news163; + } + + @Override + public String toString() { + return "News163{" + + "content='" + content + '\'' + + ", title='" + title + '\'' + + ", otherPage=" + otherPage + + '}'; + } + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) + .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java new file mode 100644 index 0000000..e878633 --- /dev/null +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.*; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-3
+ * Time: 下午8:25
+ */ +@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") +@HelpUrl("http://www.oschina.net/question/*") +@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true) +public class OschinaAnswer implements AfterExtractor{ + + @ExtractBy("//img/@title") + private String user; + + @ExtractBy("//div[@class='detail']") + private String content; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run(); + } + + @Override + public void afterProcess(Page page) { + + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java new file mode 100644 index 0000000..c1e3ea3 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -0,0 +1,33 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-2
+ * Time: 上午7:52
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") + ,new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java deleted file mode 100644 index 2bdf342..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ /dev/null @@ -1,49 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; - -import java.util.List; - -/** - * Author yihua.huang@dianping.com - * Date: 13-6-24 - * Time: 下午2:12 - */ -public class GlobalProcessor implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - final List requests = page.getHtml().links().all(); - page.addTargetRequests(requests); - - } - - @Override - public Site getSite() { - if (site == null) { - site = Site.me().setDomain("www.2345.com").setSleepTime(0) - .addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/") - .addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3") - .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, 
like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new FilePipeline("/data/webmagic/test/")) - .runAsync(); - Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new FilePipeline("/data/webmagic/test/")) - .run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java deleted file mode 100644 index 54d995e..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java +++ /dev/null @@ -1,20 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; - -/** - * @author yihua.huang@dianping.com
- * @date: 13-7-14
- * Time: 上午8:33
- */ -public class GuoxueProcessor { - - public static void main(String[] args) { - SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*"); - simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500); - Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index 76f9cc3..c0b3f73 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -7,8 +7,8 @@ import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** - * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 上午7:31
*/ public class IteyeBlogProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index f2dbe8e..8ba7063 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -13,18 +15,24 @@ import java.util.List; */ public class OschinaBlogPageProcesser implements PageProcessor { + private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + @Override public void process(Page page) { - List strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().smartContent()); - page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+")); + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @Override public Site getSite() { - return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/"). 
- setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run(); } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 76a423f..dbfa815 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java deleted file mode 100644 index 13910b5..0000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ /dev/null @@ -1,37 +0,0 @@ -package us.codecraft.webmagic.processor; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.ConsolePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; -import us.codecraft.webmagic.samples.DiandianBlogProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; - -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午8:02 - */ -public class DiandianProcessorTest { - - @Ignore - @Test - public void test() throws IOException { - DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); - //pipeline是抓取结束后的处理 - //ftl文件放到classpath:ftl/文件夹下 - //默认放到/data/temp/webmagic/ftl/[domain]目录下 - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); - //Spider.me()是简化写法,其实就是new一个啦 - //Spider.pipeline()设定一个pipeline,支持链式调用 - //ConsolePipeline输出结果到控制台 - //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 - //Spider.run()执行 - - Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). - run(); - } -} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 33bcf9c..0371eb2 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -4,9 +4,9 @@ import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; @@ -21,7 +21,7 @@ public class DiaoyuwengProcessorTest { @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); + JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); 
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index a0160e1..026f8d5 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -4,9 +4,9 @@ import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.samples.SinaBlogProcesser; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; @@ -22,9 +22,8 @@ public class SinablogProcessorTest { public void test() throws IOException { SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser(); //pipeline是抓取结束后的处理 - //ftl文件放到classpath:ftl/文件夹下 - //默认放到/data/temp/webmagic/ftl/[domain]目录下 - FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); + //默认放到/data/webmagic/ftl/[domain]目录下 + JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); //Spider.me()是简化写法,其实就是new一个啦 //Spider.pipeline()设定一个pipeline,支持链式调用 //ConsolePipeline输出结果到控制台 diff --git a/webmagic-saxon/README.md b/webmagic-saxon/README.md new file mode 100644 index 0000000..0471c68 --- /dev/null +++ b/webmagic-saxon/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。 \ No newline at end of file diff --git a/webmagic-plugin/pom.xml b/webmagic-saxon/pom.xml similarity index 76% rename from webmagic-plugin/pom.xml 
rename to webmagic-saxon/pom.xml index 2225722..a2db768 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-saxon/pom.xml @@ -5,16 +5,11 @@ us.codecraft webmagic - 0.1.0 + 0.2.0 - pom 4.0.0 - - webmagic-misc - webmagic-selenium - - webmagic-plugin + webmagic-saxon @@ -22,6 +17,10 @@ webmagic-core ${project.version} + + net.sf.saxon + Saxon-HE + junit junit diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java new file mode 100644 index 0000000..98b1efe --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -0,0 +1,178 @@ +package us.codecraft.webmagic.selector; + +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; +import org.apache.log4j.Logger; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 上午9:39 + */ +public class Xpath2Selector implements Selector { + + private String xpathStr; + + private XPathExpression xPathExpression; + + private Logger logger = Logger.getLogger(getClass()); + + public Xpath2Selector(String xpathStr) { + this.xpathStr = xpathStr; + try { + init(); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException("XPath error!", e); + } + } + + enum XPath2NamespaceContext implements NamespaceContext { + + INSTANCE; + + private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + + private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + + private void put(String prefix, String namespaceURI) { + prefix2NamespaceMap.put(prefix, namespaceURI); + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null) { + prefixes = new ArrayList(); + namespace2PrefixMap.put(namespaceURI, prefixes); + } + prefixes.add(prefix); + } + + private XPath2NamespaceContext() { + put("fn", NamespaceConstant.FN); + put("xslt", NamespaceConstant.XSLT); + } + + @Override + public String getNamespaceURI(String prefix) { + return prefix2NamespaceMap.get(prefix); + } + + @Override + public String getPrefix(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.get(0); + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + List prefixes = namespace2PrefixMap.get(namespaceURI); + if (prefixes == null || prefixes.size() < 1) { + return null; + } + return prefixes.iterator(); + } + } + + private void init() throws XPathExpressionException { + XPathEvaluator xPathEvaluator = new XPathEvaluator(); + xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE); + xPathExpression = xPathEvaluator.compile(xpathStr); + } + + @Override + public String select(String text) { + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = 
htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + if (nodeList.getLength() == 0) { + return null; + } + Node item = nodeList.item(0); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + return item.getTextContent(); + } else { + StreamResult xmlOutput = new StreamResult(new StringWriter()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.transform(new DOMSource(item), xmlOutput); + return xmlOutput.getWriter().toString(); + } + } + return result.toString(); + } catch (Exception e) { + logger.error("select text error! 
" + xpathStr, e); + } + return null; + } + + @Override + public List selectList(String text) { + List results = new ArrayList(); + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + Object result; + try { + result = xPathExpression.evaluate(document, XPathConstants.NODESET); + } catch (XPathExpressionException e) { + result = xPathExpression.evaluate(document, XPathConstants.STRING); + } + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (int i = 0; i < nodeList.getLength(); i++) { + Node item = nodeList.item(i); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + results.add(item.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(item), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } + } else { + results.add(result.toString()); + } + } catch (Exception e) { + logger.error("select text error! 
" + xpathStr, e); + } + return results; + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java similarity index 98% rename from webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java rename to webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 30d8a81..b623040 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; /** @@ -1168,7 +1169,7 @@ public class XpathSelectorTest { + " var location = window.location;\n" + " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n" + " pre.writeAttribute('codeable_id', post_id);\n" - + " pre.writeAttribute('codeable_type', \"Blog\");\n" + + " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n" + " pre.writeAttribute('source_url', source_url);\n" + " pre.writeAttribute('pre_index', index);\n" + " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n" @@ -1354,4 +1355,41 @@ public class XpathSelectorTest { Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); } + @Test + public void testXPath2() { + String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; + XpathSelector xpathSelector = new XpathSelector("//h1/text()"); + System.out.println(xpathSelector.select(text)); + } + + @Test + public void testXpath2Selector() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); + String select = xpath2Selector.select(html); + Assert.assertNotNull(select); + } + + @Ignore("take long time") + @Test + public void performanceTest() { + Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + long time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpathSelector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + xpath2Selector.selectList(html); + } + System.out.println(System.currentTimeMillis()-time); + } + } diff --git a/webmagic-selenium/README.md b/webmagic-selenium/README.md new file mode 100644 index 0000000..c8583c3 --- /dev/null +++ b/webmagic-selenium/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。 \ No newline at end of file diff --git a/webmagic-plugin/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml similarity index 61% rename from webmagic-plugin/webmagic-selenium/pom.xml rename to webmagic-selenium/pom.xml index 0da4504..814b7b3 100644 --- a/webmagic-plugin/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -2,13 +2,13 @@ - us.codecraft - webmagic-plugin - 0.1.0 + webmagic + 0.2.0 4.0.0 + webmagic-selenium @@ -17,7 +17,15 @@ selenium-java 2.33.0
+ + us.codecraft + webmagic-core + ${project.version} + + + junit + junit +
- \ No newline at end of file diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java similarity index 96% rename from webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java rename to webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 1b689d4..0fa0eea 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.selenium.downloader; +package us.codecraft.webmagic.downloader.selenium; import org.apache.log4j.Logger; import org.openqa.selenium.By; @@ -21,8 +21,8 @@ import java.util.Map; * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
* 需要下载Selenium driver支持。
* - * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 下午1:37
*/ public class SeleniumDownloader implements Downloader, Destroyable { diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java similarity index 95% rename from webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java rename to webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index faed8d6..71ba290 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.selenium.downloader; +package us.codecraft.webmagic.downloader.selenium; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; @@ -11,8 +11,8 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; /** - * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 下午1:41
*/ class WebDriverPool { diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java similarity index 92% rename from webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java index a403b91..b7bcd80 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.selenium; +package us.codecraft.webmagic.downloader; import org.junit.Ignore; import org.junit.Test; @@ -13,8 +13,8 @@ import java.util.HashMap; import java.util.Map; /** - * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 下午12:27
*/ public class SeleniumTest { diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java similarity index 93% rename from webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java index 9683083..2b8c247 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloaderTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.selenium.downloader; +package us.codecraft.webmagic.downloader.selenium; import org.junit.Ignore; import org.junit.Test; @@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** - * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 下午2:46
*/ public class SeleniumDownloaderTest { diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java similarity index 86% rename from webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java index 1efc69b..a711a19 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java @@ -1,12 +1,12 @@ -package us.codecraft.webmagic.selenium.downloader; +package us.codecraft.webmagic.downloader.selenium; import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.WebDriver; /** - * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 下午2:12
*/ public class WebDriverPoolTest { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java similarity index 84% rename from webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java rename to webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java index d8c5f05..1696a3f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -3,16 +3,15 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; -import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader; /** * 花瓣网抽取器。
* 使用Selenium做页面动态渲染。
- * @author yihua.huang@dianping.com
- * @date: 13-7-26
+ * @author code4crafter@gmail.com
+ * Date: 13-7-26
* Time: 下午4:08
*/ public class HuabanProcessor implements PageProcessor { @@ -39,7 +38,6 @@ public class HuabanProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new HuabanProcessor()).thread(5) - .scheduler(new RedisScheduler("localhost")) .pipeline(new FilePipeline("/data/webmagic/test/")) .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) .runAsync();