Merge branch 'annotation'
commit
23f6bb8d32
27
pom.xml
27
pom.xml
|
@ -3,15 +3,15 @@
|
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.1.0</version>
|
||||
<version>0.2.0</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<artifactId>webmagic</artifactId>
|
||||
|
||||
<modules>
|
||||
<module>webmagic-core</module>
|
||||
<module>webmagic-plugin/</module>
|
||||
<module>webmagic-samples/</module>
|
||||
<modules>
|
||||
<module>webmagic-core</module>
|
||||
<module>webmagic-extension/</module>
|
||||
<module>webmagic-samples/</module>
|
||||
</modules>
|
||||
|
||||
<dependencyManagement>
|
||||
|
@ -27,6 +27,11 @@
|
|||
<artifactId>httpclient</artifactId>
|
||||
<version>4.2.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<version>9.5.1-1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
|
@ -45,7 +50,7 @@
|
|||
<dependency>
|
||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||
<artifactId>htmlcleaner</artifactId>
|
||||
<version>2.4</version>
|
||||
<version>2.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
|
@ -75,6 +80,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<version>2.8</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-dependencies</id>
|
||||
|
@ -94,6 +100,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<version>2.6</version>
|
||||
<configuration>
|
||||
<encoding>UTF-8</encoding>
|
||||
</configuration>
|
||||
|
@ -101,6 +108,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>2.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
|
@ -113,6 +121,10 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>2.9.1</version>
|
||||
<configuration>
|
||||
<encoding>UTF-8</encoding>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-javadocs</id>
|
||||
|
@ -125,11 +137,10 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
<version>2.0-beta-7</version>
|
||||
<version>2.4.1</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,360 @@
|
|||
webmagic使用手册
|
||||
------
|
||||
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
|
||||
|
||||
>web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录、抓取静态资源等。
|
||||
|
||||
>作者黄亿华([code4crafter@gmail.com](code4crafter@gmail.com))目前就职于大众点评,曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。
|
||||
|
||||
>webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者:
|
||||
|
||||
>python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
|
||||
|
||||
>Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitcafe.com/laiweiwei/Spiderman)
|
||||
|
||||
>webmagic遵循[Apache 2.0协议](http://www.apache.org/licenses/LICENSE-2.0.html),你可以自由进行使用和修改。有使用不便或者问题,欢迎在github[提交issue](https://github.com/code4craft/webmagic/issues),或者在[oschina讨论模块](http://www.oschina.net/question)提问。
|
||||
|
||||
<div style="page-break-after:always"></div>
|
||||
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 使用maven
|
||||
|
||||
webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译:
|
||||
|
||||
git clone https://github.com/code4craft/webmagic.git
|
||||
mvn clean install
|
||||
|
||||
安装后,在项目中添加对应的依赖即可使用webmagic:
|
||||
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.2.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.2.0</version>
|
||||
</dependency>
|
||||
|
||||
#### 项目结构
|
||||
|
||||
webmagic主要包括两个包:
|
||||
|
||||
* **webmagic-core**
|
||||
|
||||
webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
|
||||
|
||||
* **webmagic-extension**
|
||||
|
||||
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
|
||||
|
||||
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来:
|
||||
|
||||
* **webmagic-saxon**
|
||||
|
||||
webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。
|
||||
|
||||
* **webmagic-selenium**
|
||||
|
||||
webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。
|
||||
|
||||
在项目中,你可以根据需要依赖不同的包。
|
||||
|
||||
### 不使用maven
|
||||
|
||||
不使用maven的用户,可以下载这个二进制打包版本(感谢[oschina](http://www.oschina.net/)):
|
||||
|
||||
git clone http://git.oschina.net/flashsword20/webmagic-bin.git
|
||||
|
||||
在**bin/lib**目录下,有项目依赖的所有jar包,直接在IDE里import即可。
|
||||
|
||||
### 第一个爬虫
|
||||
|
||||
#### 定制PageProcessor
|
||||
|
||||
PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码:
|
||||
|
||||
public class OschinaBlogPageProcesser implements PageProcessor {
|
||||
|
||||
private Site site = Site.me().setDomain("my.oschina.net")
|
||||
.addStartUrl("http://my.oschina.net/flashsword/blog");
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
||||
page.addTargetRequests(links);
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
|
||||
page.putField("content", page.getHtml().$("div.content").toString());
|
||||
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new OschinaBlogPageProcesser())
|
||||
.pipeline(new ConsolePipeline()).run();
|
||||
}
|
||||
}
|
||||
|
||||
这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。
|
||||
|
||||
Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。
|
||||
|
||||
执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。
|
||||
|
||||
#### 使用注解
|
||||
|
||||
webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同:
|
||||
|
||||
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
||||
public class OschinaBlog {
|
||||
|
||||
@ExtractBy("//title")
|
||||
private String title;
|
||||
|
||||
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
|
||||
private String content;
|
||||
|
||||
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||
private List<String> tags;
|
||||
|
||||
public static void main(String[] args) {
|
||||
OOSpider.create(
|
||||
Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),
|
||||
new ConsolePageModelPipeline(), OschinaBlog.class).run();
|
||||
}
|
||||
}
|
||||
|
||||
这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。
|
||||
|
||||
注解的详细使用方式见后文中得webmagic-extension注解模块。
|
||||
|
||||
<div style="page-break-after:always"></div>
|
||||
|
||||
|
||||
## webmagic-core
|
||||
|
||||
webmagic-core是爬虫的核心框架,只包括一个爬虫各功能模块的核心功能。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
|
||||
|
||||
此节部分内容摘自作者的博文
|
||||
[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796)。
|
||||
|
||||
### webmagic-core的模块划分
|
||||
|
||||
webmagic-core参考了scrapy的模块划分,分为Spider(整个爬虫的调度框架)、Downloader(页面下载)、PageProcessor(链接提取和页面分析)、Scheduler(URL管理)、Pipeline(离线分析和持久化)几部分。只不过scrapy通过middleware实现扩展,而webmagic则通过定义这几个接口,并将其不同的实现注入主框架类Spider来实现扩展。
|
||||
|
||||

|
||||
<div style="page-break-after:always"></div>
|
||||
|
||||
#### Spider类(核心调度)
|
||||
|
||||
**Spider**是爬虫的入口类,Spider的接口调用采用了链式的API设计,其他功能全部通过接口注入Spider实现,下面是启动一个比较复杂的Spider的例子。
|
||||
|
||||
Spider.create(sinaBlogProcessor)
|
||||
.scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
|
||||
.pipeline(new FilePipeline())
|
||||
.thread(10).run();
|
||||
|
||||
|
||||
Spider的核心处理流程非常简单,代码如下:
|
||||
|
||||
<!-- lang: java -->
|
||||
private void processRequest(Request request) {
|
||||
Page page = downloader.download(request, this);
|
||||
if (page == null) {
|
||||
sleep(site.getSleepTime());
|
||||
return;
|
||||
}
|
||||
pageProcessor.process(page);
|
||||
addRequest(page);
|
||||
for (Pipeline pipeline : pipelines) {
|
||||
pipeline.process(page, this);
|
||||
}
|
||||
sleep(site.getSleepTime());
|
||||
}
|
||||
|
||||
#### PageProcessor(页面分析及链接抽取)
|
||||
|
||||
页面分析是垂直爬虫中需要定制的部分。在webmagic-core里,通过实现**PageProcessor**接口来实现定制爬虫。PageProcessor有两个核心方法:public void process(Page page)和public Site getSite() 。
|
||||
|
||||
* public void process(Page page)
|
||||
|
||||
通过对**Page**对象的操作,实现爬虫逻辑。Page对象包括两个最重要的方法:addTargetRequests()可以添加URL到待抓取队列,put()可以将结果保存供后续处理。
|
||||
Page的数据可以通过Page.getHtml()和Page.getUrl()获取。
|
||||
|
||||
* public Site getSite()
|
||||
|
||||
**Site**对象定义了爬虫的域名、起始地址、抓取间隔、编码等信息。
|
||||
|
||||
**Selector**是webmagic为了简化页面抽取开发的独立模块,是webmagic-core的主要着力点。这里整合了CSS Selector、XPath和正则表达式,并可以进行链式的抽取。
|
||||
|
||||
<!-- lang: java -->
|
||||
//content是用别的爬虫工具抽取到的正文
|
||||
List<String> links = page.getHtml()
|
||||
.$("div.title") //css 选择,Java里虽然很少有$符号出现,不过貌似$作为方法名是合法的
|
||||
.xpath("//@href") //提取链接
|
||||
.regex(".*blog.*") //正则匹配过滤
|
||||
.all(); //转换为string列表
|
||||
|
||||
webmagic包括一个对于页面正文的自动抽取的类**SmartContentSelector**。相信用过Evernote Clearly都会对其自动抽取正文的技术印象深刻。这个技术又叫**Readability**。当然webmagic对Readability的实现还比较粗略,但是仍有一些学习价值。
|
||||
|
||||
基于Saxon,webmagic提供了XPath2.0语法的支持。XPath2.0语法支持内部函数、逻辑控制等,是一门完整的语言,如果你熟悉XPath2.0语法,倒是不妨一试(需要引入**webmagic-saxon**包)。
|
||||
|
||||
**webmagic-samples**包里有一些为某个站点定制的PageProcessor,供学习之用。
|
||||
|
||||
#### Downloader(页面下载)
|
||||
|
||||
**Downloader**是webmagic中下载页面的接口,主要方法:
|
||||
|
||||
* public Page download(Request request, Task task)
|
||||
|
||||
**Request**对象封装了待抓取的URL及其他信息,而Page则包含了页面下载后的Html及其他信息。Task是一个包装了任务对应的Site信息的抽象接口。
|
||||
|
||||
* public void setThread(int thread)
|
||||
|
||||
因为Downloader一般会涉及连接池等功能,而这些功能与多线程密切相关,所以定义了此方法。
|
||||
|
||||
目前有几个Downloader的实现:
|
||||
|
||||
* HttpClientDownloader
|
||||
|
||||
集成了**Apache HttpClient**的Downloader。Apache HttpClient(4.0后整合到HttpCompenent项目中)是强大的Java http下载器,它支持自定义HTTP头(对于爬虫比较有用的就是User-agent、cookie等)、自动redirect、连接复用、cookie保留、设置代理等诸多强大的功能。
|
||||
|
||||
* SeleniumDownloader
|
||||
|
||||
对于一些Javascript动态加载的网页,仅仅使用http模拟下载工具,并不能取到页面的内容。这方面的思路有两种:一种是抽丝剥茧,分析js的逻辑,再用爬虫去重现它;另一种就是:内置一个浏览器,直接获取最后加载完的页面。**webmagic-selenium**包中整合了Selenium到SeleniumDownloader,可以直接进行动态加载页面的抓取。
|
||||
|
||||
#### Scheduler(URL管理)
|
||||
|
||||
**Scheduler**是webmagic的管理模块,通过实现Scheduler可以定制自己的URL管理器。Scheduler包括两个主要方法:
|
||||
|
||||
* public void push(Request request,Task task)
|
||||
|
||||
将待抓取URL加入Scheduler。Request对象是对URL的一个封装,还包括优先级、以及一个供存储数据的Map。Task仍然用于区分不同任务,在多个任务公用一个Scheduler时可以此进行区分。
|
||||
|
||||
* public Request poll(Task task)
|
||||
|
||||
从Scheduler里取出一条请求,并进行后续执行。
|
||||
|
||||
webmagic目前有三个Scheduler的实现:
|
||||
|
||||
* QueueScheduler
|
||||
|
||||
一个简单的内存队列,速度较快,并且是线程安全的。
|
||||
|
||||
* FileCacheQueueScheduler
|
||||
|
||||
使用文件保存队列,它可以用于耗时较长的下载任务,在任务中途停止后(手动停止或者程序崩溃),下次执行仍然从中止的URL开始继续爬取。
|
||||
|
||||
* RedisScheduler
|
||||
|
||||
使用redis存储URL队列。通过使用同一台redis服务器存储URL,webmagic可以很容易的在多机部署,从而达到分布式爬虫的效果。
|
||||
|
||||
#### Pipeline(后续处理和持久化)
|
||||
|
||||
**Pipeline**是最终抽取结果进行输出和持久化的接口。它只包括一个方法:
|
||||
|
||||
* public void process(ResultItems resultItems,Task task)
|
||||
|
||||
**ResultItems**是集成了抽取结果的对象。通过ResultItems.get(key)可以获取抽取结果。Task同样是用于区分不同任务的对象。
|
||||
|
||||
webmagic包括以下几个Pipeline的实现:
|
||||
|
||||
* ConsolePipeline
|
||||
|
||||
直接输出结果到控制台,测试时使用。
|
||||
|
||||
* FilePipeline
|
||||
|
||||
输出结果到文件,每个URL单独保存到一个页面,以URL的MD5结果作为文件名。通过构造函数`public FilePipeline(String path)`定义存储路径,**以下使用文件持久化的类,多数都使用此方法指定路径**。
|
||||
|
||||
* JsonFilePipeline
|
||||
|
||||
以JSON输出结果到文件(.json后缀),其他与FilePipeline相同。
|
||||
|
||||
webmagic目前不支持持久化到数据库,但是结合其他工具,持久化到数据库也是很容易的。这里不妨看一下[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。因为JFinal目前还不支持maven,所以这段代码并没有放到webmagic-samples里来。
|
||||
|
||||
<div style="page-break-after:always"></div>
|
||||
|
||||
## webmagic-extension
|
||||
|
||||
webmagic-extension是为了开发爬虫更方便而实现的一些功能模块。这些功能完全基于webmagic-core的框架,包括注解形式编写爬虫、分页、分布式等功能。
|
||||
|
||||
### 注解模块
|
||||
|
||||
webmagic-extension包括注解模块。为什么会有注解方式?
|
||||
|
||||
因为PageProcessor的方式灵活、强大,但是没有解决两个问题:
|
||||
|
||||
* 对于一个站点,如果想抓取多种格式的URL,那么必须在PageProcesser中写判断逻辑,代码难以管理。
|
||||
* 抓取结果没有对应Model,并不符合Java程序开发习惯,与一些框架也无法很好整合。
|
||||
|
||||
注解的核心是Model类,本身是一个POJO,这个Model类用于传递、保存页面最终抓取结果数据。注解方式直接将抽取与数据绑定,以便于编写和维护。
|
||||
|
||||
注解方式其实也是通过一个PageProcessor的实现--ModelPageProcessor完成,因此对webmagic-core代码没有任何影响。
|
||||
|
||||
注解部分包括以下内容:
|
||||
|
||||
* #### TargetUrl
|
||||
|
||||
"TargetUrl"表示这个Model对应要抓取的URL,它包含两层意思:符合这个条件的URL会被加入抓取队列;符合这个条件的URL会被这个Model抓取。TargetUrl可以**sourceRegion**指定提取URL的区域(仅支持XPath)。
|
||||
|
||||
TargetUrl使用了正则表达式,匹配 "http://my.oschina.net/flashsword/blog/150039" 格式的URL。webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。
|
||||
|
||||
与TargetUrl相似的还有**HelpUrl**,HelpUrl表示:仅仅抓取该URL用作链接提取,并不对它进行内容抽取。例如博客正文页对应TargetUrl,而列表页则对应HelpUrl。
|
||||
|
||||
* #### ExtractBy
|
||||
|
||||
* ##### 用于字段
|
||||
|
||||
"ExtractBy"可用于类以及字段。用于字段时,定义了字段抽取的规则。抽取的规则默认使用[**XPath**](http://www.w3school.com.cn/xpath/),也可以选择使用CSS Selector、正则表达式(通过设置type)。
|
||||
|
||||
ExtractBy还有几个扩展属性。**multi**表示是否抽取列表,当然,设置为multi时,你需要一个List字段去容纳它。**notnull**则表示,此字段不允许为null,若为null则放弃整个对象。
|
||||
|
||||
* ##### 用于类
|
||||
"ExtractBy"用于类时,则限定了字段抽取的区域。用于类时仍支持multi,multi则表示一个页面可以抽取到多个对象。
|
||||
|
||||
* ##### ExtractByRaw & ExtractByUrl
|
||||
|
||||
在类使用"ExtractBy"修饰后,字段的"ExtractBy"使用的是其抽取的结果,如果仍然想要抽取原HTML,可以使用"ExtractByRaw"。与此类似的还有"ExtractByUrl",表示从URL中抽取信息。ExtractByUrl只支持正则表达式。
|
||||
|
||||
* ##### ExtractBy2 ExtractBy3
|
||||
|
||||
"ExtractBy"、"ExtractByRaw"支持链式抽取,通过增加注解"ExtractBy2"、"ExtractBy3"实现。
|
||||
|
||||
* #### AfterExtractor
|
||||
|
||||
AfterExtractor接口是对注解方式抽取能力不足的补充。实现AfterExtractor接口后,会在**使用注解方式填充完字段后**调用**afterProcess()**方法,在这个方法中可以直接访问已抽取的字段、补充需要抽取的字段,甚至做一些简单的输出和持久化操作(并不是很建议这么做)。这部分可以参考[webmagic结合JFinal持久化到数据库的一段代码](http://www.oschina.net/code/snippet_190591_23456)。
|
||||
|
||||
* #### OOSpider
|
||||
OOSpider是注解式爬虫的入口,这里调用**create()**方法将OschinaBlog这个类加入到爬虫的抽取中,这里是可以传入多个类的,例如:
|
||||
|
||||
OOSpider.create(
|
||||
Site.me().addStartUrl("http://www.oschina.net"),
|
||||
new ConsolePageModelPipeline(),
|
||||
OschinaBlog.clas,OschinaAnswer.class).run();
|
||||
|
||||
OOSpider会根据TargetUrl调用不同的Model进行解析。
|
||||
|
||||
* #### PageModelPipeline
|
||||
可以通过定义PageModelPipeline来选择结果输出方式。这里new ConsolePageModelPipeline()是PageModelPipeline的一个实现,会将结果输出到控制台。
|
||||
|
||||
* #### 分页
|
||||
|
||||
处理单项数据分页(例如单条新闻多个页面)是爬虫一个比较头疼的问题。webmagic目前对于分页的解决方案是:在注解模式下,Model通过实现**PagedModel**接口,并引入PagedPipeline作为第一个Pipeline来实现。具体可以参考webmagic-samples中抓取网易新闻的代码:**us.codecraft.webmagic.model.samples.News163**。
|
||||
|
||||
关于分页,这里有一篇对于webmagic分页实现的详细说明的文章[关于爬虫实现分页的一些思考](http://my.oschina.net/flashsword/blog/150039)。
|
||||
目前分页功能还没有分布式实现,如果实现RedisScheduler进行分布式爬取,请不要使用分页功能。
|
||||
|
||||
### 分布式
|
||||
|
||||
webmagic-extension中,通过redis来管理URL,达到分布式的效果。但是对于分布式爬虫,仅仅程序能够分布式运行,还满足不了大规模抓取的需要,webmagic可能后期会加入一些任务管理和监控的功能,也欢迎各位用户为webmagic提交代码,做出贡献。
|
||||
|
||||
|
|
@ -1,3 +1,3 @@
|
|||
webmagic-core
|
||||
-------
|
||||
webmagic核心部分。
|
||||
webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<version>0.2.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* <pre>
|
||||
*Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
|
||||
* Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
|
||||
*
|
||||
* 主要方法:
|
||||
* {@link #getUrl()} 获取页面的Url
|
||||
|
@ -19,6 +19,7 @@ import java.util.List;
|
|||
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
|
||||
*
|
||||
* </pre>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
public class Page {
|
||||
|
@ -36,9 +37,16 @@ public class Page {
|
|||
public Page() {
|
||||
}
|
||||
|
||||
public Page setSkip(boolean skip) {
|
||||
resultItems.setSkip(skip);
|
||||
return this;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存抽取的结果
|
||||
* @param key 结果的key
|
||||
*
|
||||
* @param key 结果的key
|
||||
* @param field 结果的value
|
||||
*/
|
||||
public void putField(String key, Object field) {
|
||||
|
@ -47,6 +55,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 获取页面的html内容
|
||||
*
|
||||
* @return html 页面的html内容
|
||||
*/
|
||||
public Selectable getHtml() {
|
||||
|
@ -63,6 +72,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 添加待抓取的链接
|
||||
*
|
||||
* @param requests 待抓取的链接
|
||||
*/
|
||||
public void addTargetRequests(List<String> requests) {
|
||||
|
@ -79,6 +89,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 添加待抓取的链接
|
||||
*
|
||||
* @param requestString 待抓取的链接
|
||||
*/
|
||||
public void addTargetRequest(String requestString) {
|
||||
|
@ -93,6 +104,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 添加待抓取的页面,在需要传递附加信息时使用
|
||||
*
|
||||
* @param request 待抓取的页面
|
||||
*/
|
||||
public void addTargetRequest(Request request) {
|
||||
|
@ -103,6 +115,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 获取页面的Url
|
||||
*
|
||||
* @return url 当前页面的url,可用于抽取
|
||||
*/
|
||||
public Selectable getUrl() {
|
||||
|
@ -111,6 +124,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 设置url
|
||||
*
|
||||
* @param url
|
||||
*/
|
||||
public void setUrl(Selectable url) {
|
||||
|
@ -119,6 +133,7 @@ public class Page {
|
|||
|
||||
/**
|
||||
* 获取抓取请求
|
||||
*
|
||||
* @return request 抓取请求
|
||||
*/
|
||||
public Request getRequest() {
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Request对象封装了待抓取的url信息。<br/>
|
||||
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。<br/>
|
||||
|
@ -18,40 +22,95 @@ package us.codecraft.webmagic;
|
|||
* String linktext = (String)page.getRequest().getExtra()[0];
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午11:37
|
||||
* Date: 13-4-21
|
||||
* Time: 上午11:37
|
||||
*/
|
||||
public class Request {
|
||||
public class Request implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 2062192774891352043L;
|
||||
|
||||
private String url;
|
||||
|
||||
private Object[] extra;
|
||||
|
||||
/**
|
||||
* 构建一个request对象
|
||||
* @param url 必须参数,待抓取的url
|
||||
* @param extra 额外参数,可以保存一些需要的上下文信息
|
||||
* 额外参数,可以保存一些需要的上下文信息
|
||||
*/
|
||||
public Request(String url, Object... extra) {
|
||||
this.url = url;
|
||||
this.extra = extra;
|
||||
private Map<String, Object> extras;
|
||||
|
||||
private double priority;
|
||||
|
||||
public Request() {
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取预存的对象
|
||||
* @return object[] 预存的对象数组
|
||||
* 构建一个request对象
|
||||
*
|
||||
* @param url 必须参数,待抓取的url
|
||||
*/
|
||||
public Object[] getExtra() {
|
||||
return extra;
|
||||
public Request(String url) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
public double getPriority() {
|
||||
return priority;
|
||||
}
|
||||
|
||||
public Request setPriority(double priority) {
|
||||
this.priority = priority;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Object getExtra(String key) {
|
||||
if (extras == null) {
|
||||
return null;
|
||||
}
|
||||
return extras.get(key);
|
||||
}
|
||||
|
||||
public Request putExtra(String key, Object value) {
|
||||
if (extras == null) {
|
||||
extras = new HashMap<String, Object>();
|
||||
}
|
||||
extras.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取待抓取的url
|
||||
*
|
||||
* @return url 待抓取的url
|
||||
*/
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
Request request = (Request) o;
|
||||
|
||||
if (!url.equals(request.url)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public Map<String, Object> getExtras() {
|
||||
return extras;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return url.hashCode();
|
||||
}
|
||||
|
||||
public void setExtras(Map<String, Object> extras) {
|
||||
this.extras = extras;
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
this.url = url;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@ import java.util.Map;
|
|||
|
||||
/**
|
||||
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-25 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-25 <br>
|
||||
* Time: 下午12:20 <br>
|
||||
*/
|
||||
public class ResultItems {
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
@ -90,6 +92,11 @@ public class Site {
|
|||
* @return 已设置的domain
|
||||
*/
|
||||
public String getDomain() {
|
||||
if (domain == null) {
|
||||
if (startUrls.size() > 0) {
|
||||
domain = UrlUtils.getDomain(startUrls.get(0));
|
||||
}
|
||||
}
|
||||
return domain;
|
||||
}
|
||||
|
||||
|
@ -150,6 +157,7 @@ public class Site {
|
|||
|
||||
/**
|
||||
* 获取初始页面的地址列表
|
||||
*
|
||||
* @return 初始页面的地址列表
|
||||
*/
|
||||
public List<String> getStartUrls() {
|
||||
|
@ -158,6 +166,7 @@ public class Site {
|
|||
|
||||
/**
|
||||
* 增加初始页面的地址,可反复调用此方法增加多个初始地址。
|
||||
*
|
||||
* @param startUrl 初始页面的地址
|
||||
* @return this
|
||||
*/
|
||||
|
@ -179,6 +188,7 @@ public class Site {
|
|||
|
||||
/**
|
||||
* 获取两次抓取之间的间隔
|
||||
*
|
||||
* @return 两次抓取之间的间隔,单位毫秒
|
||||
*/
|
||||
public int getSleepTime() {
|
||||
|
@ -187,6 +197,7 @@ public class Site {
|
|||
|
||||
/**
|
||||
* 获取重新下载的次数,默认为0
|
||||
*
|
||||
* @return 重新下载的次数
|
||||
*/
|
||||
public int getRetryTimes() {
|
||||
|
@ -195,6 +206,7 @@ public class Site {
|
|||
|
||||
/**
|
||||
* 设置获取重新下载的次数,默认为0
|
||||
*
|
||||
* @return this
|
||||
*/
|
||||
public Site setRetryTimes(int retryTimes) {
|
||||
|
@ -219,7 +231,7 @@ public class Site {
|
|||
return true;
|
||||
}
|
||||
|
||||
public Task toTask(){
|
||||
public Task toTask() {
|
||||
return new Task() {
|
||||
@Override
|
||||
public String getUUID() {
|
||||
|
|
|
@ -8,8 +8,8 @@ import us.codecraft.webmagic.downloader.HttpClientDownloader;
|
|||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.schedular.QueueScheduler;
|
||||
import us.codecraft.webmagic.schedular.Scheduler;
|
||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||
import us.codecraft.webmagic.utils.ThreadUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -228,8 +228,10 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
pageProcessor.process(page);
|
||||
addRequest(page);
|
||||
for (Pipeline pipeline : pipelines) {
|
||||
pipeline.process(page.getResultItems(), this);
|
||||
if (!page.getResultItems().isSkip()){
|
||||
for (Pipeline pipeline : pipelines) {
|
||||
pipeline.process(page.getResultItems(), this);
|
||||
}
|
||||
}
|
||||
sleep(site.getSleepTime());
|
||||
}
|
||||
|
@ -283,6 +285,11 @@ public class Spider implements Runnable, Task {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Spider clearPipeline(){
|
||||
pipelines=new ArrayList<Pipeline>();
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getUUID() {
|
||||
if (uuid != null) {
|
||||
|
|
|
@ -2,8 +2,8 @@ package us.codecraft.webmagic.downloader;
|
|||
|
||||
/**
|
||||
* 比较占用资源的服务可以实现该接口,Spider会在结束时调用destroy()释放资源。<br>
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午3:10 <br>
|
||||
*/
|
||||
public interface Destroyable {
|
||||
|
|
|
@ -7,29 +7,18 @@ import java.util.Map;
|
|||
|
||||
/**
|
||||
* 命令行输出抽取结果。可用于测试。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:45
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:45
|
||||
*/
|
||||
public class ConsolePipeline implements Pipeline{
|
||||
public class ConsolePipeline implements Pipeline {
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems,Task task) {
|
||||
if (resultItems.isSkip()){
|
||||
return;
|
||||
}
|
||||
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
System.out.println("get page: " + resultItems.getRequest().getUrl());
|
||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||
if (entry.getValue() instanceof Iterable) {
|
||||
Iterable value = (Iterable) entry.getValue();
|
||||
System.out.println(entry.getKey() + ":");
|
||||
for (Object o : value) {
|
||||
System.out.println(o);
|
||||
}
|
||||
} else {
|
||||
System.out.println(entry.getKey() + ":\t" + entry.getValue());
|
||||
}
|
||||
System.out.println(entry.getKey()+":\t"+entry.getValue());
|
||||
System.out.println(entry.getKey() + ":\t" + entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,12 +20,12 @@ import java.util.Map;
|
|||
*/
|
||||
public class FilePipeline implements Pipeline {
|
||||
|
||||
private String path = "/data/temp/webmagic/";
|
||||
private String path = "/data/webmagic/";
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
/**
|
||||
* 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/"
|
||||
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
|
||||
*/
|
||||
public FilePipeline() {
|
||||
|
||||
|
@ -37,6 +37,9 @@ public class FilePipeline implements Pipeline {
|
|||
* @param path 文件保存路径
|
||||
*/
|
||||
public FilePipeline(String path) {
|
||||
if (!path.endsWith("/")&&!path.endsWith("\\")){
|
||||
path+="/";
|
||||
}
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
|
@ -47,9 +50,6 @@ public class FilePipeline implements Pipeline {
|
|||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
if (resultItems.isSkip()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
|
||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.schedular;
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Request;
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.schedular;
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
|
@ -0,0 +1,53 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 下午5:29 <br>
|
||||
*/
|
||||
public class AndSelector implements Selector {
|
||||
|
||||
private List<Selector> selectors = new ArrayList<Selector>();
|
||||
|
||||
public AndSelector(Selector... selectors) {
|
||||
for (Selector selector : selectors) {
|
||||
this.selectors.add(selector);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
for (Selector selector : selectors) {
|
||||
if (text == null) {
|
||||
return null;
|
||||
}
|
||||
text = selector.select(text);
|
||||
}
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
List<String> results = new ArrayList<String>();
|
||||
boolean first = true;
|
||||
for (Selector selector : selectors) {
|
||||
if (first) {
|
||||
results = selector.selectList(text);
|
||||
first = false;
|
||||
} else {
|
||||
List<String> resultsTemp = new ArrayList<String>();
|
||||
for (String result : results) {
|
||||
resultsTemp.addAll(selector.selectList(result));
|
||||
}
|
||||
results = resultsTemp;
|
||||
if (results == null || results.size() == 0) {
|
||||
return results;
|
||||
}
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 下午5:29 <br>
|
||||
*/
|
||||
public class OrSelector implements Selector {
|
||||
|
||||
private List<Selector> selectors = new ArrayList<Selector>();
|
||||
|
||||
public OrSelector(Selector... selectors) {
|
||||
for (Selector selector : selectors) {
|
||||
this.selectors.add(selector);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
for (Selector selector : selectors) {
|
||||
text = selector.select(text);
|
||||
if (text!=null){
|
||||
return text;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
List<String> results = new ArrayList<String>();
|
||||
for (Selector selector : selectors) {
|
||||
List<String> strings = selector.selectList(text);
|
||||
results.addAll(strings);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
|
@ -8,7 +8,7 @@ import java.util.List;
|
|||
* Date: 13-4-20
|
||||
* Time: 下午8:02
|
||||
*/
|
||||
interface Selector {
|
||||
public interface Selector {
|
||||
|
||||
public String select(String text);
|
||||
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
webmagic-extension
|
||||
-------
|
||||
webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。
|
|
@ -4,24 +4,33 @@
|
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-plugin</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>0.2.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-misc</artifactId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.freemarker</groupId>
|
||||
<artifactId>freemarker</artifactId>
|
||||
<version>2.3.15</version>
|
||||
<groupId>com.alibaba</groupId>
|
||||
<artifactId>fastjson</artifactId>
|
||||
<version>1.1.35</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>redis.clients</groupId>
|
||||
<artifactId>jedis</artifactId>
|
||||
<version>2.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,20 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-4 <br>
|
||||
* Time: 下午5:18 <br>
|
||||
*/
|
||||
public interface PagedModel {
|
||||
|
||||
public String getPageKey();
|
||||
|
||||
public Collection<String> getOtherPages();
|
||||
|
||||
public String getPage();
|
||||
|
||||
public PagedModel combine(PagedModel pagedModel);
|
||||
|
||||
}
|
|
@ -34,6 +34,9 @@ public class FileDownloader implements Downloader {
|
|||
}
|
||||
|
||||
public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
|
||||
if (!path.endsWith("/")&&!path.endsWith("\\")){
|
||||
path+="/";
|
||||
}
|
||||
this.path = path;
|
||||
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
|
||||
/**
|
||||
* 实现这个接口即可在抽取后进行后处理。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 上午9:42 <br>
|
||||
*/
|
||||
public interface AfterExtractor {
|
||||
|
||||
public void afterProcess(Page page);
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import org.apache.commons.lang3.builder.ToStringBuilder;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 下午3:41 <br>
|
||||
*/
|
||||
public class ConsolePageModelPipeline implements PageModelPipeline {
|
||||
@Override
|
||||
public void process(Object o, Task task) {
|
||||
System.out.println(ToStringBuilder.reflectionToString(o));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午9:48 <br>
|
||||
*/
|
||||
class Extractor {
|
||||
|
||||
protected Selector selector;
|
||||
|
||||
protected final Source source;
|
||||
|
||||
protected final boolean notNull;
|
||||
|
||||
protected final boolean multi;
|
||||
|
||||
static enum Source {Html, Url, RawHtml}
|
||||
|
||||
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
|
||||
this.selector = selector;
|
||||
this.source = source;
|
||||
this.notNull = notNull;
|
||||
this.multi = multi;
|
||||
}
|
||||
|
||||
Selector getSelector() {
|
||||
return selector;
|
||||
}
|
||||
|
||||
Source getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
|
||||
boolean isMulti() {
|
||||
return multi;
|
||||
}
|
||||
|
||||
void setSelector(Selector selector) {
|
||||
this.selector = selector;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午9:48 <br>
|
||||
*/
|
||||
class FieldExtractor extends Extractor{
|
||||
|
||||
private final Field field;
|
||||
|
||||
private Method setterMethod;
|
||||
|
||||
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) {
|
||||
super(selector, source, notNull,multi);
|
||||
this.field = field;
|
||||
}
|
||||
|
||||
Field getField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
Selector getSelector() {
|
||||
return selector;
|
||||
}
|
||||
|
||||
Source getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
void setSetterMethod(Method setterMethod) {
|
||||
this.setterMethod = setterMethod;
|
||||
}
|
||||
|
||||
Method getSetterMethod() {
|
||||
return setterMethod;
|
||||
}
|
||||
|
||||
boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* 基于PageProcessor的扩展点。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:46 <br>
|
||||
*/
|
||||
class ModelPageProcessor implements PageProcessor {
|
||||
|
||||
private List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();
|
||||
|
||||
private Site site;
|
||||
|
||||
private Set<Pattern> targetUrlPatterns = new HashSet<Pattern>();
|
||||
|
||||
public static ModelPageProcessor create(Site site, Class... clazzs) {
|
||||
ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
|
||||
for (Class clazz : clazzs) {
|
||||
modelPageProcessor.addPageModel(clazz);
|
||||
}
|
||||
return modelPageProcessor;
|
||||
}
|
||||
|
||||
|
||||
public ModelPageProcessor addPageModel(Class clazz) {
|
||||
PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
|
||||
targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
|
||||
targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
|
||||
pageModelExtractorList.add(pageModelExtractor);
|
||||
return this;
|
||||
}
|
||||
|
||||
private ModelPageProcessor(Site site) {
|
||||
this.site = site;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
|
||||
extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
|
||||
extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
|
||||
Object process = pageModelExtractor.process(page);
|
||||
if (process == null || (process instanceof List && ((List) process).size() == 0)) {
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
postProcessPageModel(pageModelExtractor.getClazz(), process);
|
||||
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
|
||||
}
|
||||
}
|
||||
|
||||
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
|
||||
List<String> links;
|
||||
if (urlRegionSelector == null) {
|
||||
links = page.getHtml().links().all();
|
||||
} else {
|
||||
links = urlRegionSelector.selectList(page.getHtml().toString());
|
||||
}
|
||||
for (String link : links) {
|
||||
for (Pattern targetUrlPattern : urlPatterns) {
|
||||
Matcher matcher = targetUrlPattern.matcher(link);
|
||||
if (matcher.find()) {
|
||||
page.addTargetRequest(new Request(matcher.group(1)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected void postProcessPageModel(Class clazz, Object object) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
import java.lang.annotation.Annotation;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* 基于Pipeline的扩展点,用于实现注解格式的Pipeline。<br>
|
||||
* 与PageModelPipeline是一对多的关系(原谅作者没有更好的名字了)。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-2 <br>
|
||||
* Time: 上午10:47 <br>
|
||||
*/
|
||||
class ModelPipeline implements Pipeline {
|
||||
|
||||
private Map<Class, PageModelPipeline> pageModelPipelines = new ConcurrentHashMap<Class, PageModelPipeline>();
|
||||
|
||||
public ModelPipeline() {
|
||||
}
|
||||
|
||||
public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipeline) {
|
||||
pageModelPipelines.put(clazz, pageModelPipeline);
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
|
||||
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
|
||||
if (o != null) {
|
||||
Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class);
|
||||
if (annotation == null || !((ExtractBy) annotation).multi()) {
|
||||
classPageModelPipelineEntry.getValue().process(o, task);
|
||||
} else {
|
||||
List<Object> list = (List<Object>) o;
|
||||
for (Object o1 : list) {
|
||||
classPageModelPipelineEntry.getValue().process(o1, task);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
|
||||
/**
|
||||
* 基于Model的Spider,封装后的入口类。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 上午9:51 <br>
|
||||
*/
|
||||
public class OOSpider extends Spider {
|
||||
|
||||
private ModelPageProcessor modelPageProcessor;
|
||||
|
||||
private ModelPipeline modelPipeline;
|
||||
|
||||
protected OOSpider(ModelPageProcessor modelPageProcessor) {
|
||||
super(modelPageProcessor);
|
||||
this.modelPageProcessor = modelPageProcessor;
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建一个爬虫。<br>
|
||||
* @param site
|
||||
* @param pageModelPipeline
|
||||
* @param pageModels
|
||||
*/
|
||||
public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
|
||||
this(ModelPageProcessor.create(site, pageModels));
|
||||
this.modelPipeline = new ModelPipeline();
|
||||
super.pipeline(modelPipeline);
|
||||
if (pageModelPipeline!=null){
|
||||
for (Class pageModel : pageModels) {
|
||||
this.modelPipeline.put(pageModel, pageModelPipeline);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static OOSpider create(Site site, Class... pageModels) {
|
||||
return new OOSpider(site, null, pageModels);
|
||||
}
|
||||
|
||||
public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
|
||||
return new OOSpider(site, pageModelPipeline, pageModels);
|
||||
}
|
||||
|
||||
public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
|
||||
for (Class pageModel : pageModels) {
|
||||
modelPageProcessor.addPageModel(pageModel);
|
||||
modelPipeline.put(pageModel, pageModelPipeline);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,355 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.model.annotation.*;
|
||||
import us.codecraft.webmagic.selector.*;
|
||||
|
||||
import java.lang.annotation.Annotation;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午9:33 <br>
|
||||
*/
|
||||
class PageModelExtractor {
|
||||
|
||||
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
|
||||
|
||||
private Selector targetUrlRegionSelector;
|
||||
|
||||
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
|
||||
|
||||
private Selector helpUrlRegionSelector;
|
||||
|
||||
private Class clazz;
|
||||
|
||||
private List<FieldExtractor> fieldExtractors;
|
||||
|
||||
private Extractor extractor;
|
||||
|
||||
public static PageModelExtractor create(Class clazz) {
|
||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||
pageModelExtractor.init(clazz);
|
||||
return pageModelExtractor;
|
||||
}
|
||||
|
||||
private void init(Class clazz) {
|
||||
this.clazz = clazz;
|
||||
initClassExtractors();
|
||||
fieldExtractors = new ArrayList<FieldExtractor>();
|
||||
for (Field field : clazz.getDeclaredFields()) {
|
||||
field.setAccessible(true);
|
||||
FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
|
||||
FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field);
|
||||
if (fieldExtractor != null && fieldExtractorTmp != null) {
|
||||
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
|
||||
} else if (fieldExtractor == null && fieldExtractorTmp != null) {
|
||||
fieldExtractor = fieldExtractorTmp;
|
||||
}
|
||||
// ExtractBy2 & ExtractBy3
|
||||
if (fieldExtractor!=null){
|
||||
addAnnotationExtractBy2(fieldExtractor);
|
||||
addAnnotationExtractBy3(fieldExtractor);
|
||||
}
|
||||
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
|
||||
if (fieldExtractor != null && fieldExtractorTmp != null) {
|
||||
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
|
||||
} else if (fieldExtractor == null && fieldExtractorTmp != null) {
|
||||
fieldExtractor = fieldExtractorTmp;
|
||||
}
|
||||
if (fieldExtractor != null) {
|
||||
if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be string");
|
||||
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
|
||||
throw new IllegalStateException("Field " + field.getName() + " must be list");
|
||||
}
|
||||
fieldExtractors.add(fieldExtractor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
|
||||
FieldExtractor fieldExtractor = null;
|
||||
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
|
||||
if (extractByUrl != null) {
|
||||
String regexPattern = extractByUrl.value();
|
||||
if (regexPattern.trim().equals("")) {
|
||||
regexPattern = ".*";
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
}
|
||||
return fieldExtractor;
|
||||
}
|
||||
|
||||
private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
|
||||
FieldExtractor fieldExtractor = null;
|
||||
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
}
|
||||
return fieldExtractor;
|
||||
}
|
||||
|
||||
private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
|
||||
ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
|
||||
}
|
||||
}
|
||||
|
||||
private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
|
||||
ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
|
||||
if (extractBy != null) {
|
||||
String value = extractBy.value();
|
||||
Selector selector;
|
||||
switch (extractBy.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
|
||||
}
|
||||
}
|
||||
|
||||
private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
|
||||
FieldExtractor fieldExtractor = null;
|
||||
ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
|
||||
if (extractByRaw != null) {
|
||||
String value = extractByRaw.value();
|
||||
Selector selector;
|
||||
switch (extractByRaw.type()) {
|
||||
case Css:
|
||||
selector = new CssSelector(value);
|
||||
break;
|
||||
case Regex:
|
||||
selector = new RegexSelector(value);
|
||||
break;
|
||||
case XPath:
|
||||
selector = new XpathSelector(value);
|
||||
break;
|
||||
default:
|
||||
selector = new XpathSelector(value);
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
}
|
||||
}
|
||||
return fieldExtractor;
|
||||
}
|
||||
|
||||
public static Method getSetterMethod(Class clazz, Field field) {
|
||||
String name = "set" + StringUtils.capitalize(field.getName());
|
||||
try {
|
||||
Method declaredMethod = clazz.getDeclaredMethod(name, field.getType());
|
||||
declaredMethod.setAccessible(true);
|
||||
return declaredMethod;
|
||||
} catch (NoSuchMethodException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private void initClassExtractors() {
|
||||
Annotation annotation = clazz.getAnnotation(TargetUrl.class);
|
||||
if (annotation == null) {
|
||||
targetUrlPatterns.add(Pattern.compile(".*"));
|
||||
} else {
|
||||
TargetUrl targetUrl = (TargetUrl) annotation;
|
||||
String[] value = targetUrl.value();
|
||||
for (String s : value) {
|
||||
targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
|
||||
}
|
||||
if (!targetUrl.sourceRegion().equals("")) {
|
||||
targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
|
||||
}
|
||||
}
|
||||
annotation = clazz.getAnnotation(HelpUrl.class);
|
||||
if (annotation != null) {
|
||||
HelpUrl helpUrl = (HelpUrl) annotation;
|
||||
String[] value = helpUrl.value();
|
||||
for (String s : value) {
|
||||
helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
|
||||
}
|
||||
if (!helpUrl.sourceRegion().equals("")) {
|
||||
helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
|
||||
}
|
||||
}
|
||||
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||
if (annotation != null) {
|
||||
ExtractBy extractBy = (ExtractBy) annotation;
|
||||
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||
}
|
||||
}
|
||||
|
||||
public Object process(Page page) {
|
||||
boolean matched = false;
|
||||
for (Pattern targetPattern : targetUrlPatterns) {
|
||||
if (targetPattern.matcher(page.getUrl().toString()).matches()) {
|
||||
matched = true;
|
||||
}
|
||||
}
|
||||
if (!matched) {
|
||||
return null;
|
||||
}
|
||||
if (extractor == null) {
|
||||
return processSingle(page, page.getHtml().toString());
|
||||
} else {
|
||||
if (extractor.multi) {
|
||||
List<Object> os = new ArrayList<Object>();
|
||||
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
|
||||
for (String s : list) {
|
||||
Object o = processSingle(page, s);
|
||||
if (o != null) {
|
||||
os.add(o);
|
||||
}
|
||||
}
|
||||
return os;
|
||||
} else {
|
||||
String select = extractor.getSelector().select(page.getHtml().toString());
|
||||
Object o = processSingle(page, select);
|
||||
return o;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Object processSingle(Page page, String html) {
|
||||
Object o = null;
|
||||
try {
|
||||
o = clazz.newInstance();
|
||||
for (FieldExtractor fieldExtractor : fieldExtractors) {
|
||||
if (fieldExtractor.isMulti()) {
|
||||
List<String> value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
||||
break;
|
||||
case Html:
|
||||
value = fieldExtractor.getSelector().selectList(html);
|
||||
break;
|
||||
case Url:
|
||||
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||
break;
|
||||
default:
|
||||
value = fieldExtractor.getSelector().selectList(html);
|
||||
}
|
||||
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
setField(o, fieldExtractor, value);
|
||||
} else {
|
||||
String value;
|
||||
switch (fieldExtractor.getSource()) {
|
||||
case RawHtml:
|
||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
||||
break;
|
||||
case Html:
|
||||
value = fieldExtractor.getSelector().select(html);
|
||||
break;
|
||||
case Url:
|
||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||
break;
|
||||
default:
|
||||
value = fieldExtractor.getSelector().select(html);
|
||||
}
|
||||
if (value == null && fieldExtractor.isNotNull()) {
|
||||
return null;
|
||||
}
|
||||
setField(o, fieldExtractor, value);
|
||||
}
|
||||
}
|
||||
if (AfterExtractor.class.isAssignableFrom(clazz)) {
|
||||
((AfterExtractor) o).afterProcess(page);
|
||||
}
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IllegalAccessException e) {
|
||||
e.printStackTrace();
|
||||
} catch (InvocationTargetException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
||||
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (fieldExtractor.getSetterMethod() != null) {
|
||||
fieldExtractor.getSetterMethod().invoke(o, value);
|
||||
}
|
||||
fieldExtractor.getField().set(o, value);
|
||||
}
|
||||
|
||||
Class getClazz() {
|
||||
return clazz;
|
||||
}
|
||||
|
||||
List<Pattern> getTargetUrlPatterns() {
|
||||
return targetUrlPatterns;
|
||||
}
|
||||
|
||||
List<Pattern> getHelpUrlPatterns() {
|
||||
return helpUrlPatterns;
|
||||
}
|
||||
|
||||
Selector getTargetUrlRegionSelector() {
|
||||
return targetUrlRegionSelector;
|
||||
}
|
||||
|
||||
Selector getHelpUrlRegionSelector() {
|
||||
return helpUrlRegionSelector;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
package us.codecraft.webmagic.model;
|
||||
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-3 <br>
|
||||
* Time: 上午9:34 <br>
|
||||
*/
|
||||
public interface PageModelPipeline<T> {
|
||||
|
||||
public void process(T t, Task task);
|
||||
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD, ElementType.TYPE})
|
||||
public @interface ExtractBy {
|
||||
|
||||
/**
|
||||
* 抽取规则
|
||||
*
|
||||
* @return 抽取规则
|
||||
*/
|
||||
String value();
|
||||
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
/**
|
||||
* 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath
|
||||
*
|
||||
* @return 抽取规则类型
|
||||
*/
|
||||
Type type() default Type.XPath;
|
||||
|
||||
/**
|
||||
* 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false
|
||||
*
|
||||
* @return 是否是不能为空的关键字段
|
||||
*/
|
||||
boolean notNull() default false;
|
||||
|
||||
/**
|
||||
* 是否抽取多个结果<br>
|
||||
* 用于字段时,需要List<String>来盛放结果<br>
|
||||
* 用于类时,表示单页抽取多个对象<br>
|
||||
*
|
||||
* @return 是否抽取多个结果
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface ExtractBy2 {
|
||||
|
||||
String value();
|
||||
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath;
|
||||
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface ExtractBy3 {
|
||||
|
||||
String value();
|
||||
|
||||
public enum Type { XPath, Regex, Css}
|
||||
|
||||
Type type() default Type.XPath;
|
||||
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD, ElementType.TYPE})
|
||||
public @interface ExtractByRaw {
|
||||
|
||||
/**
|
||||
* 抽取规则
|
||||
*
|
||||
* @return 抽取规则
|
||||
*/
|
||||
String value();
|
||||
|
||||
public enum Type {XPath, Regex, Css}
|
||||
|
||||
/**
|
||||
* 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath
|
||||
*
|
||||
* @return 抽取规则类型
|
||||
*/
|
||||
Type type() default Type.XPath;
|
||||
|
||||
/**
|
||||
* 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false
|
||||
*
|
||||
* @return 是否是不能为空的关键字段
|
||||
*/
|
||||
boolean notNull() default false;
|
||||
|
||||
/**
|
||||
* 是否抽取多个结果<br>
|
||||
* 需要List<String>来盛放结果<br>
|
||||
*
|
||||
* @return 是否抽取多个结果
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义类或者字段的抽取规则(从url中抽取,只支持正则表达式)。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface ExtractByUrl{
|
||||
|
||||
/**
|
||||
* 抽取规则,支持正则表达式
|
||||
*
|
||||
* @return 抽取规则
|
||||
*/
|
||||
String value() default "";
|
||||
|
||||
/**
|
||||
* 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false
|
||||
*
|
||||
* @return 是否是不能为空的关键字段
|
||||
*/
|
||||
boolean notNull() default false;
|
||||
|
||||
/**
|
||||
* 是否抽取多个结果<br>
|
||||
* 用于字段时,需要List<String>来盛放结果<br>
|
||||
* 用于类时,表示单页抽取多个对象<br>
|
||||
*
|
||||
* @return 是否抽取多个结果
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义辅助爬取的url。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.TYPE})
|
||||
public @interface HelpUrl {
|
||||
|
||||
/**
|
||||
* 某个类对应的URL规则列表<br>
|
||||
* webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
|
||||
*
|
||||
* @return 抽取规则
|
||||
*/
|
||||
String[] value();
|
||||
|
||||
/**
|
||||
* 指定提取URL的区域(仅支持XPath)
|
||||
* @return 指定提取URL的区域
|
||||
*/
|
||||
String sourceRegion() default "";
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package us.codecraft.webmagic.model.annotation;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* 定义某个类抽取的范围和来源,sourceRegion可以用xpath语法限定抽取区域。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-1 <br>
|
||||
* Time: 下午8:40 <br>
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.TYPE})
|
||||
public @interface TargetUrl {
|
||||
|
||||
/**
|
||||
* 某个类对应的URL规则列表<br>
|
||||
* webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
|
||||
*
|
||||
* @return 抽取规则
|
||||
*/
|
||||
String[] value();
|
||||
|
||||
/**
|
||||
* 指定提取URL的区域(仅支持XPath)
|
||||
* @return 指定提取URL的区域
|
||||
*/
|
||||
String sourceRegion() default "";
|
||||
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
webmagic注解抓取方式所定义的注解。
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,61 @@
|
|||
package us.codecraft.webmagic.pipeline;

import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

/**
 * Pipeline that persists extraction results to disk as JSON: one sub-directory
 * per task (site UUID), one file per page named by the md5 of its URL.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 下午6:28
 */
public class JsonFilePipeline implements Pipeline {

    private String path = "/data/webmagic/";

    private Logger logger = Logger.getLogger(getClass());

    /**
     * Create a JsonFilePipeline using the default storage path "/data/webmagic/".
     */
    public JsonFilePipeline() {

    }

    /**
     * Create a JsonFilePipeline.
     *
     * @param path directory to save files into; a trailing separator is appended if missing
     */
    public JsonFilePipeline(String path) {
        // Normalize so later concatenation always yields a valid directory path.
        if (!path.endsWith("/") && !path.endsWith("\\")) {
            path += "/";
        }
        this.path = path;
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        // BUG FIX: this.path is already "/"-terminated (default and constructor
        // both guarantee it), so the extra "/" produced "...//uuid/".
        String path = this.path + task.getUUID() + "/";
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
        }
        PrintWriter printWriter = null;
        try {
            printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"));
            printWriter.write(JSON.toJSONString(resultItems.getAll()));
        } catch (IOException e) {
            logger.warn("write file error", e);
        } finally {
            // BUG FIX: close in finally so the file handle is not leaked when
            // the write throws (previously close() was skipped on exception).
            if (printWriter != null) {
                printWriter.close();
            }
        }
    }
}
|
|
@ -0,0 +1,84 @@
|
|||
package us.codecraft.webmagic.pipeline;

import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.DoubleKeyMap;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Pipeline that merges multi-page results into one object.<br>
 * Every PagedModel result is buffered until all pages sharing the same page key
 * have been seen; the pages are then combined in page order and the merged
 * object flows on to the next pipeline.<br>
 * Do not use this with the redis-based distributed spider: the page buffer is
 * kept in local memory only.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 下午5:15 <br>
 */
public class PagedPipeline implements Pipeline {

    // pageKey -> (page number -> whether that page has been processed yet)
    private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);

    // pageKey -> (page number -> the extracted model of that page)
    private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);

    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> resultItemsAll = resultItems.getAll();
        Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
        while (iterator.hasNext()) {
            // handleObject advances the iterator and may remove the current entry.
            handleObject(iterator);
        }
    }

    /**
     * Handles one result entry: records its page, then either removes the entry
     * (some sibling pages are still missing) or replaces its value with the
     * combined model (all pages have been seen).
     */
    private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
        Map.Entry<String, Object> objectEntry = iterator.next();
        Object o = objectEntry.getValue();
        if (o instanceof PagedModel) {
            PagedModel pagedModel = (PagedModel) o;
            // Mark this page as processed.
            pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
            if (pagedModel.getOtherPages() != null) {
                // Register sibling pages not seen yet as pending (FALSE).
                for (String otherPage : pagedModel.getOtherPages()) {
                    Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
                    if (aBoolean == null) {
                        pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
                    }
                }
            }
            //check if all pages are processed
            Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey());
            objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
            if (booleanMap == null) {
                return;
            }
            for (Map.Entry<String, Boolean> stringBooleanEntry : booleanMap.entrySet()) {
                if (!stringBooleanEntry.getValue()) {
                    // A page is still pending: hold this entry back for now.
                    iterator.remove();
                    return;
                }
            }
            // All pages seen: sort by page number (numeric when possible) and combine.
            List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
            entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
            if (entryList.size() != 0) {
                Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() {
                    @Override
                    public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
                        try {
                            int i1 = Integer.parseInt(o1.getKey());
                            int i2 = Integer.parseInt(o2.getKey());
                            return i1 - i2;
                        } catch (NumberFormatException e) {
                            // Non-numeric page keys fall back to lexicographic order.
                            return o1.getKey().compareTo(o2.getKey());
                        }
                    }
                });
                PagedModel value = entryList.get(0).getValue();
                for (int i = 1; i < entryList.size(); i++) {
                    value = value.combine(entryList.get(i).getValue());
                }
                // Replace the single-page entry with the fully combined model.
                objectEntry.setValue(value);
            }
        }
    }
}
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.schedular;
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
|
@ -46,6 +46,9 @@ public class FileCacheQueueScheduler implements Scheduler {
|
|||
private Set<String> urls;
|
||||
|
||||
public FileCacheQueueScheduler(String filePath) {
|
||||
if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){
|
||||
filePath+="/";
|
||||
}
|
||||
this.filePath = filePath;
|
||||
}
|
||||
|
|
@ -1,17 +1,18 @@
|
|||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import com.alibaba.fastjson.JSON;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import redis.clients.jedis.Jedis;
|
||||
import redis.clients.jedis.JedisPool;
|
||||
import redis.clients.jedis.JedisPoolConfig;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.schedular.Scheduler;
|
||||
|
||||
/**
|
||||
* 使用redis管理url,构建一个分布式的爬虫。<br>
|
||||
*
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-25 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-25 <br>
|
||||
* Time: 上午7:07 <br>
|
||||
*/
|
||||
public class RedisScheduler implements Scheduler {
|
||||
|
@ -22,6 +23,8 @@ public class RedisScheduler implements Scheduler {
|
|||
|
||||
private static final String SET_PREFIX = "set_";
|
||||
|
||||
private static final String ITEM_PREFIX = "item_";
|
||||
|
||||
public RedisScheduler(String host) {
|
||||
pool = new JedisPool(new JedisPoolConfig(), host);
|
||||
}
|
||||
|
@ -33,7 +36,12 @@ public class RedisScheduler implements Scheduler {
|
|||
if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
|
||||
//使用List保存队列
|
||||
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
||||
jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl());
|
||||
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
|
||||
if (request.getExtras() != null) {
|
||||
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
|
||||
byte[] bytes = JSON.toJSONString(request).getBytes();
|
||||
jedis.set(key.getBytes(), bytes);
|
||||
}
|
||||
}
|
||||
pool.returnResource(jedis);
|
||||
}
|
||||
|
@ -42,10 +50,16 @@ public class RedisScheduler implements Scheduler {
|
|||
public synchronized Request poll(Task task) {
|
||||
Jedis jedis = pool.getResource();
|
||||
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
|
||||
pool.returnResource(jedis);
|
||||
if (url==null){
|
||||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
|
||||
byte[] bytes = jedis.get(key.getBytes());
|
||||
if (bytes != null) {
|
||||
Request o = JSON.parseObject(new String(bytes),Request.class);
|
||||
return o;
|
||||
}
|
||||
pool.returnResource(jedis);
|
||||
return new Request(url);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
package us.codecraft.webmagic.utils;

import java.util.Map;

/**
 * A two-level map: values are addressed by a pair of keys (key1, key2).
 * Nested maps are instantiated from the prototype Map class supplied to the
 * constructor (HashMap by default, via MultiKeyMapBase).
 *
 * @author code4crafter@gmail.com
 * Date Dec 14, 2012
 */
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
    private Map<K1, Map<K2, V>> map;

    public DoubleKeyMap() {
        init();
    }

    public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
        this(map, DEFAULT_CLAZZ);
    }

    public DoubleKeyMap(Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        init();
    }

    // Lazily create the backing map when none was supplied.
    private void init() {
        if (map == null) {
            map = this.<K1, Map<K2, V>>newMap();
        }
    }

    /**
     * init map with protoMapClass
     *
     * @param map           backing map to wrap (may be pre-populated)
     * @param protoMapClass prototype class used to instantiate nested maps
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        this.map = map;
        init();
    }

    /**
     * @param key first-level key
     * @return the nested map for this key, or null if absent
     */
    public Map<K2, V> get(K1 key) {
        return map.get(key);
    }

    /**
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the value, or null when either level is absent
     */
    public V get(K1 key1, K2 key2) {
        if (get(key1) == null) {
            return null;
        }
        return get(key1).get(key2);
    }


    /**
     * Associates a whole nested map with key1.
     * BUG FIX: the original body was {@code return put(key1, submap);} — the
     * method called itself and recursed until StackOverflowError. It now stores
     * into the backing map.
     *
     * @param key1   first-level key
     * @param submap nested map to store
     * @return always null (signature kept for compatibility; no caller could
     *         previously observe a return value because the call never returned)
     */
    public V put(K1 key1, Map<K2, V> submap) {
        map.put(key1, submap);
        return null;
    }

    /**
     * @param key1  first-level key
     * @param key2  second-level key
     * @param value value to store
     * @return the previous value under (key1, key2), or null
     */
    public V put(K1 key1, K2 key2, V value) {
        if (map.get(key1) == null) {
            // Create the nested map on first use of key1.
            map.put(key1, this.<K2, V>newMap());
        }
        return get(key1).put(key2, value);
    }

    /**
     * Removes one value; also discards the nested map when it becomes empty.
     *
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the removed value, or null
     */
    public V remove(K1 key1, K2 key2) {
        if (get(key1) == null) {
            return null;
        }
        V remove = get(key1).remove(key2);
        // Reclaim the now-empty nested map.
        if (get(key1).size() == 0) {
            remove(key1);
        }
        return remove;
    }

    /**
     * Removes the whole nested map for key1.
     *
     * @param key1 first-level key
     * @return the removed nested map, or null
     */
    public Map<K2, V> remove(K1 key1) {
        Map<K2, V> remove = map.remove(key1);
        return remove;
    }
}
|
|
@ -0,0 +1,42 @@
|
|||
package us.codecraft.webmagic.utils;

import java.util.HashMap;
import java.util.Map;

/**
 * Base class for multi-key maps. It remembers which Map implementation to use
 * as a prototype and instantiates fresh nested maps from it on demand.
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;

    @SuppressWarnings("rawtypes")
    private Class<? extends Map> protoMapClass = DEFAULT_CLAZZ;

    /** Uses HashMap as the prototype map implementation. */
    public MultiKeyMapBase() {
    }

    /** Uses the given Map implementation as the prototype. */
    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class<? extends Map> prototype) {
        this.protoMapClass = prototype;
    }

    /**
     * Creates a new, empty map of the prototype class.
     *
     * @throws IllegalArgumentException when the prototype class cannot be instantiated
     */
    @SuppressWarnings("unchecked")
    protected <K, V2> Map<K, V2> newMap() {
        final String error = "wrong proto type map " + protoMapClass;
        try {
            return (Map<K, V2>) protoMapClass.newInstance();
        } catch (InstantiationException e) {
            throw new IllegalArgumentException(error);
        } catch (IllegalAccessException e) {
            throw new IllegalArgumentException(error);
        }
    }
}
|
|
@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site;
|
|||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-25 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-25 <br>
|
||||
* Time: 上午7:51 <br>
|
||||
*/
|
||||
public class RedisSchedulerTest {
|
||||
|
@ -35,8 +35,11 @@ public class RedisSchedulerTest {
|
|||
return null;
|
||||
}
|
||||
};
|
||||
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
|
||||
Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
|
||||
request.putExtra("1","2");
|
||||
redisScheduler.push(request, task);
|
||||
Request poll = redisScheduler.poll(task);
|
||||
System.out.println(poll);
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
webmagic-lucene
|
||||
--------
|
||||
尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。
|
|
@ -0,0 +1,37 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.2.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-lucene</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analyzers-common</artifactId>
|
||||
<version>4.4.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-queryparser</artifactId>
|
||||
<version>4.4.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -0,0 +1,92 @@
|
|||
package us.codecraft.webmagic.pipeline;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Pipeline that indexes every extraction result into an in-memory Lucene
 * (RAMDirectory) index; results can then be queried via {@link #search}.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-5 <br>
 * Time: 下午2:11 <br>
 */
public class LucenePipeline implements Pipeline {

    private Directory directory;

    private Analyzer analyzer;

    private void init() throws IOException {
        analyzer = new StandardAnalyzer(Version.LUCENE_44);
        directory = new RAMDirectory();
    }

    public LucenePipeline() {
        try {
            init();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches the index for documents whose field matches the query string.
     *
     * @param fieldName field to search
     * @param value     query string (parsed by QueryParser)
     * @return up to 1000 matching documents
     * @throws IOException    on index access errors — NOTE(review): also thrown
     *                        before the first document has been indexed (no
     *                        segments yet); callers are expected to tolerate it
     * @throws ParseException when the query string cannot be parsed
     */
    public List<Document> search(String fieldName, String value) throws IOException, ParseException {
        List<Document> documents = new ArrayList<Document>();
        DirectoryReader ireader = DirectoryReader.open(directory);
        IndexSearcher isearcher = new IndexSearcher(ireader);
        // Parse a simple query that searches for "text":
        QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
        Query query = parser.parse(value);
        ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
        // Iterate through the results:
        for (int i = 0; i < hits.length; i++) {
            Document hitDoc = isearcher.doc(hits[i].doc);
            documents.add(hitDoc);
        }
        ireader.close();
        return documents;
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.isSkip()) {
            return;
        }
        Document doc = new Document();
        Map<String, Object> all = resultItems.getAll();
        if (all == null) {
            return;
        }
        for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
            Object value = objectEntry.getValue();
            if (value == null) {
                // BUG FIX: fields extracted with notNull=false may be absent;
                // skip them instead of throwing NPE on value.toString().
                continue;
            }
            doc.add(new Field(objectEntry.getKey(), value.toString(), TextField.TYPE_STORED));
        }
        try {
            // BUG FIX: an IndexWriterConfig instance must not be reused across
            // IndexWriters in Lucene 4.x — the original kept one config field,
            // so the second call to process() failed when constructing the
            // writer. Build a fresh config per writer.
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_44, analyzer));
            indexWriter.addDocument(doc);
            indexWriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
|
|
@ -0,0 +1,61 @@
|
|||
package us.codecraft.webmagic.lucene;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.LucenePipeline;

import java.io.IOException;
import java.util.List;

/**
 * Demo: crawl oschina blog posts into a LucenePipeline index while repeatedly
 * searching the index for "webmagic" and printing the hits.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 上午7:52 <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {

    // Page title, extracted by XPath.
    @ExtractBy("//title")
    private String title;

    // Post body, extracted by CSS selector.
    @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
    private String content;

    @Override
    public String toString() {
        return "OschinaBlog{" +
                "title='" + title + '\'' +
                ", content='" + content + '\'' +
                '}';
    }

    public static void main(String[] args) {
        LucenePipeline pipeline = new LucenePipeline();
        // Crawl asynchronously; the loop below polls the index every 3 seconds.
        OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync();
        while (true) {
            try {
                List<Document> search = pipeline.search("title", "webmagic");
                System.out.println(search);
                Thread.sleep(3000);
            } catch (IOException e) {
                // NOTE(review): thrown until the first document is indexed — presumably tolerated by design.
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }
}
|
|
@ -1,6 +0,0 @@
|
|||
webmagic-plugin
|
||||
-------
|
||||
webmagic的插件模块。
|
||||
目前仅实现了freemarker模板渲染,和redis实现分布式爬虫。
|
||||
|
||||
另外有一个使用Selenium来动态渲染页面的模块在开发中。
|
|
@ -1,60 +0,0 @@
|
|||
package us.codecraft.webmagic.pipeline;

import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateException;
import org.apache.commons.codec.digest.DigestUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

/**
 * Pipeline that renders extraction results through a Freemarker template and
 * writes one HTML file per page, named by the md5 of its URL.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-6-8
 * Time: 下午9:00
 */
public class FreemarkerPipeline implements Pipeline {

    private Configuration configuration;

    private Template template;

    private String path = "/data/temp/webmagic/ftl/";

    /**
     * @param template template file name, resolved against the classpath "ftl/" directory
     * @param path     output directory for rendered files; a trailing separator is appended if missing
     * @throws IOException if the template cannot be loaded
     */
    public FreemarkerPipeline(String template, String path) throws IOException {
        configuration = new Configuration();
        configuration.setDirectoryForTemplateLoading(new File(this.getClass().getClassLoader().getResource("ftl/").getFile()));
        this.template = configuration.getTemplate(template);
        // Normalize like JsonFilePipeline so concatenation below stays valid.
        if (!path.endsWith("/") && !path.endsWith("\\")) {
            path += "/";
        }
        this.path = path;
        // BUG FIX: the original built "new File(path)" and discarded the result;
        // actually create the output directory.
        new File(path).mkdirs();
    }

    public FreemarkerPipeline(String template) throws IOException {
        this(template, "/data/temp/webmagic/ftl/");
    }


    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.isSkip()) {
            return;
        }
        String path = this.path + task.getUUID() + "/";
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
        }
        PrintWriter printWriter = null;
        try {
            printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
            template.process(resultItems.getAll(), printWriter);
        } catch (TemplateException e) {
            // BUG FIX: was silently swallowed; report like the IOException branch.
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // BUG FIX: close in finally so the file handle is not leaked on error.
            if (printWriter != null) {
                printWriter.close();
            }
        }
    }
}
|
|
@ -1,23 +0,0 @@
|
|||
<item>
|
||||
<title>$it.Title</title>
|
||||
<link>http://127.0.0.1/wordpress/?p=$it.Id</link>
|
||||
<pubDate>${date}</pubDate>
|
||||
<dc:creator>admin</dc:creator>
|
||||
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=$it.Id</guid>
|
||||
<description></description>
|
||||
<content:encoded><![CDATA[${text}]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||
<#--<wp:post_id>$it.Id</wp:post_id>-->
|
||||
<wp:post_date>${date}</wp:post_date>
|
||||
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
||||
<wp:comment_status>open</wp:comment_status>
|
||||
<wp:ping_status>open</wp:ping_status>
|
||||
<wp:post_name>${title}</wp:post_name>
|
||||
<wp:status>publish</wp:status>
|
||||
<wp:post_parent>0</wp:post_parent>
|
||||
<wp:menu_order>0</wp:menu_order>
|
||||
<wp:post_type>post</wp:post_type>
|
||||
<wp:post_password></wp:post_password>
|
||||
<wp:is_sticky>0</wp:is_sticky>
|
||||
$tags
|
||||
</item>
|
|
@ -1,19 +0,0 @@
|
|||
package us.codecraft.webmagic;

import org.junit.Test;
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;

import java.io.IOException;

/**
 * Verifies that FreemarkerPipeline can locate and load a template from the
 * classpath "ftl/" directory.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-6-9
 * Time: 上午7:14
 */
public class FreemarkerPipelineTest {

    @Test
    public void testTemplateLoad() throws IOException {
        // Fails with IOException when "wordpress.ftl" cannot be found or parsed.
        new FreemarkerPipeline("wordpress.ftl");
    }
}
|
|
@ -1,3 +0,0 @@
|
|||
webmagic-selenium
|
||||
-------
|
||||
尝试使用selenium来进行页面动态渲染,开发中。
|
|
@ -0,0 +1,3 @@
|
|||
webmagic-samples
|
||||
-------
|
||||
webmagic的一些示例。包括抓取常见博客、信息类网站等。
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<version>0.2.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -19,12 +19,7 @@
|
|||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-misc</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-selenium</artifactId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
@ -33,4 +28,23 @@
|
|||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>2.4</version>
|
||||
<configuration>
|
||||
<archive>
|
||||
<manifest>
|
||||
<addClasspath>true</addClasspath>
|
||||
<classpathPrefix>./lib/</classpathPrefix>
|
||||
<mainClass>us.codecraft.webmagic.main.QuickStarter</mainClass>
|
||||
</manifest>
|
||||
</archive>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,70 @@
|
|||
package us.codecraft.webmagic.main;

import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.samples.IteyeBlog;
import us.codecraft.webmagic.model.samples.News163;
import us.codecraft.webmagic.model.samples.OschinaBlog;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;

/**
 * Interactive demo entry point: the user picks one of the sample page models
 * from a menu, the matching spider runs for 20 seconds, then the JVM exits.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-7 <br>
 * Time: 下午9:24 <br>
 */
public class QuickStarter {

    // Menu key -> page model class (typed Class<?> instead of the raw Class).
    private static Map<String, Class<?>> clazzMap;

    // Menu key -> start URL for that demo.
    private static Map<String, String> urlMap;

    private static void init() {
        // LinkedHashMap keeps the menu entries in insertion order.
        clazzMap = new LinkedHashMap<String, Class<?>>();
        clazzMap.put("1", OschinaBlog.class);
        clazzMap.put("2", IteyeBlog.class);
        clazzMap.put("3", News163.class);
        urlMap = new LinkedHashMap<String, String>();
        urlMap.put("1", "http://my.oschina.net/flashsword/blog");
        urlMap.put("2", "http://flashsword20.iteye.com/");
        urlMap.put("3", "http://news.163.com/");
    }

    public static void main(String[] args) {
        init();
        String key = null;
        key = readKey(key);
        System.out.println("The demo started and will last 20 seconds...");
        //Start spider
        OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync();

        try {
            Thread.sleep(20000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.out.println("The demo stopped!");
        System.out.println("To more usage, try to customize your own Spider!");
        System.exit(0);
    }

    /**
     * Prints the menu and blocks until the user enters a valid choice.
     *
     * @param key initial choice, may be null
     * @return a key present in clazzMap
     */
    private static String readKey(String key) {
        Scanner stdin = new Scanner(System.in);
        System.out.println("Choose a Spider demo:");
        for (Map.Entry<String, Class<?>> classEntry : clazzMap.entrySet()) {
            System.out.println(classEntry.getKey() + "\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey()));
        }
        while (key == null) {
            // FIX (idiom): "new String(stdin.nextLine())" was a pointless copy.
            key = stdin.nextLine();
            if (clazzMap.get(key) == null) {
                System.out.println("Invalid choice!");
                key = null;
            }
        }
        return key;
    }
}
|
|
@ -0,0 +1,13 @@
|
|||
package us.codecraft.webmagic.model.samples;

/**
 * Common accessor interface shared by the blog page models in the samples.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 上午8:10 <br>
 */
public interface Blog {

    public String getTitle();

    public String getContent();
}
|
|
@ -0,0 +1,41 @@
|
|||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-2 <br>
|
||||
* Time: 上午7:52 <br>
|
||||
*/
|
||||
@TargetUrl("http://*.iteye.com/blog/*")
|
||||
public class IteyeBlog implements Blog{
|
||||
|
||||
@ExtractBy("//title")
|
||||
private String title;
|
||||
|
||||
@ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css)
|
||||
private String content;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "IteyeBlog{" +
|
||||
"title='" + title + '\'' +
|
||||
", content='" + content + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run();
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractBy2;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import us.codecraft.webmagic.scheduler.RedisScheduler;

import java.util.Collection;
import java.util.List;

/**
 * Page model for 163.com news articles that are split over several pages.
 * Implements PagedModel so PagedPipeline can stitch the pages back together.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 下午8:17 <br>
 */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel {

    // Article id shared by all pages of one article (URL part before any "_n" suffix).
    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    // Page number from the "_n" URL suffix; null on the first page (notNull=false).
    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

    // Sibling page numbers: pager links extracted by XPath, then reduced to the
    // numeric suffix by the secondary regex extractor (ExtractBy2).
    @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false)
    @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex)
    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
    private String title;

    @ExtractBy("//div[@id=\"epContentLeft\"]")
    private String content;

    @Override
    public String getPageKey() {
        return pageKey;
    }

    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    @Override
    public String getPage() {
        // The first page has no "_n" suffix, so a null page means page "1".
        if (page == null) {
            return "1";
        }
        return page;
    }

    @Override
    public PagedModel combine(PagedModel pagedModel) {
        // Keep this page's title and append the other page's content.
        News163 news163 = new News163();
        news163.title = this.title;
        News163 pagedModel1 = (News163) pagedModel;
        news163.content = this.content + pagedModel1.content;
        return news163;
    }

    @Override
    public String toString() {
        return "News163{" +
                "content='" + content + '\'' +
                ", title='" + title + '\'' +
                ", otherPage=" + otherPage +
                '}';
    }

    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
                .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
    }

}
|
|
@ -0,0 +1,34 @@
|
|||
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;

/**
 * Page model for a single answer on an oschina question page. The class-level
 * ExtractBy with multi=true splits one page into several model instances (one
 * per answer item); the field extractors then run inside each answer region.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 下午8:25 <br>
 */
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")
@ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
public class OschinaAnswer implements AfterExtractor{

    // Answerer's display name (title attribute of the avatar image).
    @ExtractBy("//img/@title")
    private String user;

    // Answer body.
    @ExtractBy("//div[@class='detail']")
    private String content;

    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
    }

    @Override
    public void afterProcess(Page page) {
        // Post-extraction hook; intentionally left empty in this demo.
    }
}
|
|
@ -0,0 +1,33 @@
|
|||
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;

import java.util.List;

/**
 * Page model demo for oschina blog posts: title and content via XPath/CSS,
 * tags as a multi-valued extraction; results printed by ConsolePageModelPipeline.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 上午7:52 <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {

    @ExtractBy("//title")
    private String title;

    // Post body via CSS selector.
    @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
    private String content;

    // All tag link texts (multi = true yields a list).
    @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
    private List<String> tags;

    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
                ,new ConsolePageModelPipeline(), OschinaBlog.class).run();
    }

}
|
|
@ -1,49 +0,0 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Author yihua.huang@dianping.com
|
||||
* Date: 13-6-24
|
||||
* Time: 下午2:12
|
||||
*/
|
||||
public class GlobalProcessor implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
final List<String> requests = page.getHtml().links().all();
|
||||
page.addTargetRequests(requests);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
if (site == null) {
|
||||
site = Site.me().setDomain("www.2345.com").setSleepTime(0)
|
||||
.addStartUrl("http://www.2345.com/").addStartUrl("http://hao.360.cn/")
|
||||
.addStartUrl("http://www.baidu.com/s?wd=%E7%BD%91%E7%AB%99%E5%AF%BC%E8%88%AA&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=80039098_oem_dg&rsv_n=2&rsv_sug3=6&rsv_sug4=698&rsv_sug=0&rsv_sug1=3")
|
||||
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
}
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new GlobalProcessor()).thread(10)
|
||||
.scheduler(new RedisScheduler("localhost"))
|
||||
.pipeline(new FilePipeline("/data/webmagic/test/"))
|
||||
.runAsync();
|
||||
Spider.create(new GlobalProcessor()).thread(10)
|
||||
.scheduler(new RedisScheduler("localhost"))
|
||||
.pipeline(new FilePipeline("/data/webmagic/test/"))
|
||||
.run();
|
||||
}
|
||||
}
|
|
@ -1,20 +0,0 @@
|
|||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-14 <br>
|
||||
* Time: 上午8:33 <br>
|
||||
*/
|
||||
public class GuoxueProcessor {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SimplePageProcessor simplePageProcessor = new SimplePageProcessor("http://www.guoxue123.cn/", "http://www.guoxue123.cn/*");
|
||||
simplePageProcessor.getSite().setCharset("GBK").setSleepTime(500);
|
||||
Spider.create(simplePageProcessor).pipeline(new FilePipeline("/data/webmagic/")).scheduler(new FileCacheQueueScheduler("/data/webmagic/")).run();
|
||||
}
|
||||
}
|
|
@ -7,8 +7,8 @@ import us.codecraft.webmagic.pipeline.FilePipeline;
|
|||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 上午7:31 <br>
|
||||
*/
|
||||
public class IteyeBlogProcessor implements PageProcessor {
|
||||
|
|
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.samples;
|
|||
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -13,18 +15,24 @@ import java.util.List;
|
|||
*/
|
||||
public class OschinaBlogPageProcesser implements PageProcessor {
|
||||
|
||||
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all();
|
||||
page.addTargetRequests(strings);
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1"));
|
||||
page.putField("content", page.getHtml().smartContent());
|
||||
page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+"));
|
||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
||||
page.addTargetRequests(links);
|
||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
|
||||
page.putField("content", page.getHtml().$("div.content").toString());
|
||||
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/").
|
||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||
return site;
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ import org.junit.Test;
|
|||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||
import us.codecraft.webmagic.samples.HuxiuProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-6-9
|
||||
* Time: 上午8:02
|
||||
*/
|
||||
public class DiandianProcessorTest {
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor();
|
||||
//pipeline是抓取结束后的处理
|
||||
//ftl文件放到classpath:ftl/文件夹下
|
||||
//默认放到/data/temp/webmagic/ftl/[domain]目录下
|
||||
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
//Spider.me()是简化写法,其实就是new一个啦
|
||||
//Spider.pipeline()设定一个pipeline,支持链式调用
|
||||
//ConsolePipeline输出结果到控制台
|
||||
//FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
|
||||
//Spider.run()执行
|
||||
|
||||
Spider.create(diaoyuwengProcessor).pipeline(new ConsolePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
||||
run();
|
||||
}
|
||||
}
|
|
@ -4,9 +4,9 @@ import org.junit.Ignore;
|
|||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
||||
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -21,7 +21,7 @@ public class DiaoyuwengProcessorTest {
|
|||
@Test
|
||||
public void test() throws IOException {
|
||||
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
||||
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
|
||||
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
||||
run();
|
||||
}
|
||||
|
|
|
@ -4,9 +4,9 @@ import org.junit.Ignore;
|
|||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
||||
import us.codecraft.webmagic.samples.SinaBlogProcesser;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -22,9 +22,8 @@ public class SinablogProcessorTest {
|
|||
public void test() throws IOException {
|
||||
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
|
||||
//pipeline是抓取结束后的处理
|
||||
//ftl文件放到classpath:ftl/文件夹下
|
||||
//默认放到/data/temp/webmagic/ftl/[domain]目录下
|
||||
FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
//默认放到/data/webmagic/ftl/[domain]目录下
|
||||
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
|
||||
//Spider.me()是简化写法,其实就是new一个啦
|
||||
//Spider.pipeline()设定一个pipeline,支持链式调用
|
||||
//ConsolePipeline输出结果到控制台
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
webmagic-extension
|
||||
-------
|
||||
webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。
|
|
@ -5,16 +5,11 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<version>0.2.0</version>
|
||||
</parent>
|
||||
<packaging>pom</packaging>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<modules>
|
||||
<module>webmagic-misc</module>
|
||||
<module>webmagic-selenium</module>
|
||||
</modules>
|
||||
|
||||
<artifactId>webmagic-plugin</artifactId>
|
||||
<artifactId>webmagic-saxon</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -22,6 +17,10 @@
|
|||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
|
@ -0,0 +1,178 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import net.sf.saxon.lib.NamespaceConstant;
|
||||
import net.sf.saxon.xpath.XPathEvaluator;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.htmlcleaner.CleanerProperties;
|
||||
import org.htmlcleaner.DomSerializer;
|
||||
import org.htmlcleaner.HtmlCleaner;
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.namespace.NamespaceContext;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午9:39
|
||||
*/
|
||||
public class Xpath2Selector implements Selector {
|
||||
|
||||
private String xpathStr;
|
||||
|
||||
private XPathExpression xPathExpression;
|
||||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
public Xpath2Selector(String xpathStr) {
|
||||
this.xpathStr = xpathStr;
|
||||
try {
|
||||
init();
|
||||
} catch (XPathExpressionException e) {
|
||||
throw new IllegalArgumentException("XPath error!", e);
|
||||
}
|
||||
}
|
||||
|
||||
enum XPath2NamespaceContext implements NamespaceContext {
|
||||
|
||||
INSTANCE;
|
||||
|
||||
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>();
|
||||
|
||||
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>();
|
||||
|
||||
private void put(String prefix, String namespaceURI) {
|
||||
prefix2NamespaceMap.put(prefix, namespaceURI);
|
||||
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
|
||||
if (prefixes == null) {
|
||||
prefixes = new ArrayList<String>();
|
||||
namespace2PrefixMap.put(namespaceURI, prefixes);
|
||||
}
|
||||
prefixes.add(prefix);
|
||||
}
|
||||
|
||||
private XPath2NamespaceContext() {
|
||||
put("fn", NamespaceConstant.FN);
|
||||
put("xslt", NamespaceConstant.XSLT);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getNamespaceURI(String prefix) {
|
||||
return prefix2NamespaceMap.get(prefix);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPrefix(String namespaceURI) {
|
||||
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
|
||||
if (prefixes == null || prefixes.size() < 1) {
|
||||
return null;
|
||||
}
|
||||
return prefixes.get(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator getPrefixes(String namespaceURI) {
|
||||
List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
|
||||
if (prefixes == null || prefixes.size() < 1) {
|
||||
return null;
|
||||
}
|
||||
return prefixes.iterator();
|
||||
}
|
||||
}
|
||||
|
||||
private void init() throws XPathExpressionException {
|
||||
XPathEvaluator xPathEvaluator = new XPathEvaluator();
|
||||
xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE);
|
||||
xPathExpression = xPathEvaluator.compile(xpathStr);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
Object result;
|
||||
try {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
|
||||
} catch (XPathExpressionException e) {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.STRING);
|
||||
}
|
||||
if (result instanceof NodeList) {
|
||||
NodeList nodeList = (NodeList) result;
|
||||
if (nodeList.getLength() == 0) {
|
||||
return null;
|
||||
}
|
||||
Node item = nodeList.item(0);
|
||||
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
|
||||
return item.getTextContent();
|
||||
} else {
|
||||
StreamResult xmlOutput = new StreamResult(new StringWriter());
|
||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
||||
transformer.transform(new DOMSource(item), xmlOutput);
|
||||
return xmlOutput.getWriter().toString();
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
List<String> results = new ArrayList<String>();
|
||||
try {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
|
||||
Object result;
|
||||
try {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
|
||||
} catch (XPathExpressionException e) {
|
||||
result = xPathExpression.evaluate(document, XPathConstants.STRING);
|
||||
}
|
||||
if (result instanceof NodeList) {
|
||||
NodeList nodeList = (NodeList) result;
|
||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
||||
StreamResult xmlOutput = new StreamResult();
|
||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
Node item = nodeList.item(i);
|
||||
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
|
||||
results.add(item.getTextContent());
|
||||
} else {
|
||||
xmlOutput.setWriter(new StringWriter());
|
||||
transformer.transform(new DOMSource(item), xmlOutput);
|
||||
results.add(xmlOutput.getWriter().toString());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
results.add(result.toString());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("select text error! " + xpathStr, e);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
|
@ -1168,7 +1169,7 @@ public class XpathSelectorTest {
|
|||
+ " var location = window.location;\n"
|
||||
+ " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n"
|
||||
+ " pre.writeAttribute('codeable_id', post_id);\n"
|
||||
+ " pre.writeAttribute('codeable_type', \"Blog\");\n"
|
||||
+ " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n"
|
||||
+ " pre.writeAttribute('source_url', source_url);\n"
|
||||
+ " pre.writeAttribute('pre_index', index);\n"
|
||||
+ " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n"
|
||||
|
@ -1354,4 +1355,41 @@ public class XpathSelectorTest {
|
|||
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testXPath2() {
|
||||
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
|
||||
"<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
|
||||
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
|
||||
System.out.println(xpathSelector.select(text));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testXpath2Selector() {
|
||||
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
|
||||
String select = xpath2Selector.select(html);
|
||||
Assert.assertNotNull(select);
|
||||
}
|
||||
|
||||
@Ignore("take long time")
|
||||
@Test
|
||||
public void performanceTest() {
|
||||
Xpath2Selector xpath2Selector = new Xpath2Selector("//a");
|
||||
long time =System.currentTimeMillis();
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
xpath2Selector.selectList(html);
|
||||
}
|
||||
System.out.println(System.currentTimeMillis()-time);
|
||||
XpathSelector xpathSelector = new XpathSelector("//a");
|
||||
time =System.currentTimeMillis();
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
xpathSelector.selectList(html);
|
||||
}
|
||||
System.out.println(System.currentTimeMillis()-time);
|
||||
time =System.currentTimeMillis();
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
xpath2Selector.selectList(html);
|
||||
}
|
||||
System.out.println(System.currentTimeMillis()-time);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
webmagic-extension
|
||||
-------
|
||||
webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。
|
|
@ -2,13 +2,13 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
|
||||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-plugin</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<artifactId>webmagic</artifactId>
|
||||
<version>0.2.0</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-selenium</artifactId>
|
||||
|
||||
<dependencies>
|
||||
|
@ -17,7 +17,15 @@
|
|||
<artifactId>selenium-java</artifactId>
|
||||
<version>2.33.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.selenium.downloader;
|
||||
package us.codecraft.webmagic.downloader.selenium;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.openqa.selenium.By;
|
||||
|
@ -21,8 +21,8 @@ import java.util.Map;
|
|||
* 使用Selenium调用浏览器进行渲染。目前仅支持chrome。<br>
|
||||
* 需要下载Selenium driver支持。<br>
|
||||
*
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午1:37 <br>
|
||||
*/
|
||||
public class SeleniumDownloader implements Downloader, Destroyable {
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.selenium.downloader;
|
||||
package us.codecraft.webmagic.downloader.selenium;
|
||||
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
|
@ -11,8 +11,8 @@ import java.util.concurrent.LinkedBlockingDeque;
|
|||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午1:41 <br>
|
||||
*/
|
||||
class WebDriverPool {
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.selenium;
|
||||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
@ -13,8 +13,8 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午12:27 <br>
|
||||
*/
|
||||
public class SeleniumTest {
|
|
@ -1,4 +1,4 @@
|
|||
package us.codecraft.webmagic.selenium.downloader;
|
||||
package us.codecraft.webmagic.downloader.selenium;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
@ -8,8 +8,8 @@ import us.codecraft.webmagic.Site;
|
|||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午2:46 <br>
|
||||
*/
|
||||
public class SeleniumDownloaderTest {
|
|
@ -1,12 +1,12 @@
|
|||
package us.codecraft.webmagic.selenium.downloader;
|
||||
package us.codecraft.webmagic.downloader.selenium;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午2:12 <br>
|
||||
*/
|
||||
public class WebDriverPoolTest {
|
|
@ -3,16 +3,15 @@ package us.codecraft.webmagic.samples;
|
|||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader;
|
||||
|
||||
/**
|
||||
* 花瓣网抽取器。<br>
|
||||
* 使用Selenium做页面动态渲染。<br>
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-7-26 <br>
|
||||
* Time: 下午4:08 <br>
|
||||
*/
|
||||
public class HuabanProcessor implements PageProcessor {
|
||||
|
@ -39,7 +38,6 @@ public class HuabanProcessor implements PageProcessor {
|
|||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new HuabanProcessor()).thread(5)
|
||||
.scheduler(new RedisScheduler("localhost"))
|
||||
.pipeline(new FilePipeline("/data/webmagic/test/"))
|
||||
.downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
|
||||
.runAsync();
|
Loading…
Reference in New Issue