merge
parent
4f84b5f8d6
commit
57556ab879
18
README.md
18
README.md
|
@ -4,7 +4,7 @@ webmagic
|
||||||
|
|
||||||
[](https://travis-ci.org/code4craft/webmagic)
|
[](https://travis-ci.org/code4craft/webmagic)
|
||||||
|
|
||||||
>A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler.
|
>A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistence. It can simplify the development of a specific crawler.
|
||||||
|
|
||||||
## Features:
|
## Features:
|
||||||
|
|
||||||
|
@ -17,23 +17,17 @@ webmagic
|
||||||
|
|
||||||
## Install:
|
## Install:
|
||||||
|
|
||||||
Clone the repo and build:
|
Add dependencies to your pom.xml:
|
||||||
|
|
||||||
git clone https://github.com/code4craft/webmagic.git
|
|
||||||
cd webmagic
|
|
||||||
mvn clean install
|
|
||||||
|
|
||||||
Add dependencies to your project:
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.2.0</version>
|
<version>0.3.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.2.0</version>
|
<version>0.3.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
## Get Started:
|
## Get Started:
|
||||||
|
@ -42,6 +36,7 @@ Add dependencies to your project:
|
||||||
|
|
||||||
Write a class implements PageProcessor:
|
Write a class implements PageProcessor:
|
||||||
|
|
||||||
|
```java
|
||||||
public class OschinaBlogPageProcesser implements PageProcessor {
|
public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
private Site site = Site.me().setDomain("my.oschina.net")
|
private Site site = Site.me().setDomain("my.oschina.net")
|
||||||
|
@ -67,6 +62,7 @@ Write a class implements PageProcessor:
|
||||||
.pipeline(new ConsolePipeline()).run();
|
.pipeline(new ConsolePipeline()).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
```
|
||||||
|
|
||||||
* `page.addTargetRequests(links)`
|
* `page.addTargetRequests(links)`
|
||||||
|
|
||||||
|
@ -74,6 +70,7 @@ Write a class implements PageProcessor:
|
||||||
|
|
||||||
You can also use annotation way:
|
You can also use annotation way:
|
||||||
|
|
||||||
|
```java
|
||||||
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
||||||
public class OschinaBlog {
|
public class OschinaBlog {
|
||||||
|
|
||||||
|
@ -92,6 +89,7 @@ You can also use annotation way:
|
||||||
new ConsolePageModelPipeline(), OschinaBlog.class).run();
|
new ConsolePageModelPipeline(), OschinaBlog.class).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
```
|
||||||
|
|
||||||
### Docs and samples:
|
### Docs and samples:
|
||||||
|
|
||||||
|
|
14
pom.xml
14
pom.xml
|
@ -6,9 +6,13 @@
|
||||||
<version>7</version>
|
<version>7</version>
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.2.1</version>
|
<version>0.3.1-SNAPSHOT</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||||
|
</properties>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<name>webmagic-parent</name>
|
<name>webmagic-parent</name>
|
||||||
<description>
|
<description>
|
||||||
|
@ -32,7 +36,7 @@
|
||||||
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
|
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
|
||||||
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
|
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
|
||||||
<url>git@github.com:code4craft/webmagic.git</url>
|
<url>git@github.com:code4craft/webmagic.git</url>
|
||||||
<tag>webmagic-parent-0.2.1</tag>
|
<tag>HEAD</tag>
|
||||||
</scm>
|
</scm>
|
||||||
<licenses>
|
<licenses>
|
||||||
<license>
|
<license>
|
||||||
|
@ -44,7 +48,6 @@
|
||||||
<modules>
|
<modules>
|
||||||
<module>webmagic-core</module>
|
<module>webmagic-core</module>
|
||||||
<module>webmagic-extension/</module>
|
<module>webmagic-extension/</module>
|
||||||
<module>webmagic-samples/</module>
|
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
@ -60,6 +63,11 @@
|
||||||
<artifactId>httpclient</artifactId>
|
<artifactId>httpclient</artifactId>
|
||||||
<version>4.2.4</version>
|
<version>4.2.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>xsoup</artifactId>
|
||||||
|
<version>0.1.0</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
<artifactId>Saxon-HE</artifactId>
|
<artifactId>Saxon-HE</artifactId>
|
||||||
|
|
|
@ -1,5 +1,19 @@
|
||||||
Release Notes
|
Release Notes
|
||||||
----
|
----
|
||||||
|
*2012-9-4* `version:0.3.0`
|
||||||
|
|
||||||
|
* Change default XPath selector from HtmlCleaner to [Xsoup](https://github.com/code4craft/xsoup).
|
||||||
|
|
||||||
|
[Xsoup](https://github.com/code4craft/xsoup) is an XPath selector based on Jsoup written by me. It has much better performance than HtmlCleaner.
|
||||||
|
|
||||||
|
The time to process a page is reduced from 7~9ms to 0.4ms.
|
||||||
|
|
||||||
|
If Xsoup is not stable for your usage, just call `Spider.xsoupOff()` to turn it off, and please report an issue to me!
|
||||||
|
|
||||||
|
* Add cycle retry times for Site.
|
||||||
|
|
||||||
|
When cycle retry times is set, Spider will put a url whose download failed back into the scheduler and retry it after one cycle of the queue.
|
||||||
|
|
||||||
*2012-8-20* `version:0.2.1`
|
*2012-8-20* `version:0.2.1`
|
||||||
|
|
||||||
ComboExtractor support for annotation.
|
ComboExtractor support for annotation.
|
||||||
|
|
|
@ -21,22 +21,17 @@ webmagic使用手册
|
||||||
|
|
||||||
### 使用maven
|
### 使用maven
|
||||||
|
|
||||||
webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译:
|
webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic:
|
||||||
|
|
||||||
git clone https://github.com/code4craft/webmagic.git
|
|
||||||
mvn clean install
|
|
||||||
|
|
||||||
安装后,在项目中添加对应的依赖即可使用webmagic:
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.2.0</version>
|
<version>0.2.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.2.0</version>
|
<version>0.2.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
#### 项目结构
|
#### 项目结构
|
||||||
|
@ -51,7 +46,7 @@ webmagic主要包括两个包:
|
||||||
|
|
||||||
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
|
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
|
||||||
|
|
||||||
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来:
|
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译:
|
||||||
|
|
||||||
* **webmagic-saxon**
|
* **webmagic-saxon**
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,156 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project name="module_webmagic-core" default="compile.module.webmagic-core">
|
||||||
|
<dirname property="module.webmagic-core.basedir" file="${ant.file.module_webmagic-core}"/>
|
||||||
|
|
||||||
|
<property name="module.jdk.home.webmagic-core" value="${project.jdk.home}"/>
|
||||||
|
<property name="module.jdk.bin.webmagic-core" value="${project.jdk.bin}"/>
|
||||||
|
<property name="module.jdk.classpath.webmagic-core" value="${project.jdk.classpath}"/>
|
||||||
|
|
||||||
|
<property name="compiler.args.webmagic-core" value="${compiler.args}"/>
|
||||||
|
|
||||||
|
<property name="webmagic-core.output.dir" value="${module.webmagic-core.basedir}/target/classes"/>
|
||||||
|
<property name="webmagic-core.testoutput.dir" value="${module.webmagic-core.basedir}/target/test-classes"/>
|
||||||
|
|
||||||
|
<path id="webmagic-core.module.bootclasspath">
|
||||||
|
<!-- Paths to be included in compilation bootclasspath -->
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="webmagic-core.module.production.classpath">
|
||||||
|
<path refid="${module.jdk.classpath.webmagic-core}"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||||
|
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||||
|
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||||
|
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="webmagic-core.runtime.production.module.classpath">
|
||||||
|
<pathelement location="${webmagic-core.output.dir}"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||||
|
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||||
|
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||||
|
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="webmagic-core.module.classpath">
|
||||||
|
<path refid="${module.jdk.classpath.webmagic-core}"/>
|
||||||
|
<pathelement location="${webmagic-core.output.dir}"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||||
|
<path refid="library.maven:_junit:junit:4.7.classpath"/>
|
||||||
|
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||||
|
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||||
|
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="webmagic-core.runtime.module.classpath">
|
||||||
|
<pathelement location="${webmagic-core.testoutput.dir}"/>
|
||||||
|
<pathelement location="${webmagic-core.output.dir}"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
|
||||||
|
<path refid="library.maven:_junit:junit:4.7.classpath"/>
|
||||||
|
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
|
||||||
|
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
|
||||||
|
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
|
||||||
|
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
|
||||||
|
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
|
||||||
|
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
|
||||||
|
<patternset id="excluded.from.module.webmagic-core">
|
||||||
|
<patternset refid="ignored.files"/>
|
||||||
|
</patternset>
|
||||||
|
|
||||||
|
<patternset id="excluded.from.compilation.webmagic-core">
|
||||||
|
<patternset refid="excluded.from.module.webmagic-core"/>
|
||||||
|
</patternset>
|
||||||
|
|
||||||
|
<path id="webmagic-core.module.sourcepath">
|
||||||
|
<dirset dir="${module.webmagic-core.basedir}">
|
||||||
|
<include name="src/main/java"/>
|
||||||
|
<include name="src/main/resources"/>
|
||||||
|
</dirset>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="webmagic-core.module.test.sourcepath">
|
||||||
|
<dirset dir="${module.webmagic-core.basedir}">
|
||||||
|
<include name="src/test/java"/>
|
||||||
|
<include name="src/test/resources"/>
|
||||||
|
</dirset>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
|
||||||
|
<target name="compile.module.webmagic-core" depends="compile.module.webmagic-core.production,compile.module.webmagic-core.tests" description="Compile module webmagic-core"/>
|
||||||
|
|
||||||
|
<target name="compile.module.webmagic-core.production" depends="register.custom.compilers" description="Compile module webmagic-core; production classes">
|
||||||
|
<mkdir dir="${webmagic-core.output.dir}"/>
|
||||||
|
<javac2 destdir="${webmagic-core.output.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
|
||||||
|
<compilerarg line="${compiler.args.webmagic-core}"/>
|
||||||
|
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
|
||||||
|
<classpath refid="webmagic-core.module.production.classpath"/>
|
||||||
|
<src refid="webmagic-core.module.sourcepath"/>
|
||||||
|
<patternset refid="excluded.from.compilation.webmagic-core"/>
|
||||||
|
</javac2>
|
||||||
|
|
||||||
|
<copy todir="${webmagic-core.output.dir}">
|
||||||
|
<fileset dir="${module.webmagic-core.basedir}/src/main/java">
|
||||||
|
<patternset refid="compiler.resources"/>
|
||||||
|
<type type="file"/>
|
||||||
|
</fileset>
|
||||||
|
<fileset dir="${module.webmagic-core.basedir}/src/main/resources">
|
||||||
|
<patternset refid="compiler.resources"/>
|
||||||
|
<type type="file"/>
|
||||||
|
</fileset>
|
||||||
|
</copy>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="compile.module.webmagic-core.tests" depends="register.custom.compilers,compile.module.webmagic-core.production" description="compile module webmagic-core; test classes" unless="skip.tests">
|
||||||
|
<mkdir dir="${webmagic-core.testoutput.dir}"/>
|
||||||
|
<javac2 destdir="${webmagic-core.testoutput.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
|
||||||
|
<compilerarg line="${compiler.args.webmagic-core}"/>
|
||||||
|
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
|
||||||
|
<classpath refid="webmagic-core.module.classpath"/>
|
||||||
|
<src refid="webmagic-core.module.test.sourcepath"/>
|
||||||
|
<patternset refid="excluded.from.compilation.webmagic-core"/>
|
||||||
|
</javac2>
|
||||||
|
|
||||||
|
<copy todir="${webmagic-core.testoutput.dir}">
|
||||||
|
<fileset dir="${module.webmagic-core.basedir}/src/test/java">
|
||||||
|
<patternset refid="compiler.resources"/>
|
||||||
|
<type type="file"/>
|
||||||
|
</fileset>
|
||||||
|
<fileset dir="${module.webmagic-core.basedir}/src/test/resources">
|
||||||
|
<patternset refid="compiler.resources"/>
|
||||||
|
<type type="file"/>
|
||||||
|
</fileset>
|
||||||
|
</copy>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="clean.module.webmagic-core" description="cleanup module">
|
||||||
|
<delete dir="${webmagic-core.output.dir}"/>
|
||||||
|
<delete dir="${webmagic-core.testoutput.dir}"/>
|
||||||
|
</target>
|
||||||
|
</project>
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.2.1</version>
|
<version>0.3.1-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -25,6 +25,11 @@
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>xsoup</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>log4j</groupId>
|
<groupId>log4j</groupId>
|
||||||
<artifactId>log4j</artifactId>
|
<artifactId>log4j</artifactId>
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import us.codecraft.webmagic.selector.Html;
|
||||||
import us.codecraft.webmagic.selector.Selectable;
|
import us.codecraft.webmagic.selector.Selectable;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
@ -28,7 +29,7 @@ public class Page {
|
||||||
|
|
||||||
private ResultItems resultItems = new ResultItems();
|
private ResultItems resultItems = new ResultItems();
|
||||||
|
|
||||||
private Selectable html;
|
private Html html;
|
||||||
|
|
||||||
private Selectable url;
|
private Selectable url;
|
||||||
|
|
||||||
|
@ -58,11 +59,11 @@ public class Page {
|
||||||
*
|
*
|
||||||
* @return html
|
* @return html
|
||||||
*/
|
*/
|
||||||
public Selectable getHtml() {
|
public Html getHtml() {
|
||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setHtml(Selectable html) {
|
public void setHtml(Html html) {
|
||||||
this.html = html;
|
this.html = html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,6 +88,23 @@ public class Page {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* add urls to fetch
|
||||||
|
*
|
||||||
|
* @param requests
|
||||||
|
*/
|
||||||
|
public void addTargetRequests(List<String> requests,long priority) {
|
||||||
|
synchronized (targetRequests) {
|
||||||
|
for (String s : requests) {
|
||||||
|
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||||
|
targetRequests.add(new Request(s).setPriority(priority));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* add url to fetch
|
* add url to fetch
|
||||||
*
|
*
|
||||||
|
|
|
@ -17,6 +17,8 @@ public class Request implements Serializable {
|
||||||
|
|
||||||
private static final long serialVersionUID = 2062192774891352043L;
|
private static final long serialVersionUID = 2062192774891352043L;
|
||||||
|
|
||||||
|
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||||
|
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -30,6 +30,8 @@ public class Site {
|
||||||
|
|
||||||
private int retryTimes = 0;
|
private int retryTimes = 0;
|
||||||
|
|
||||||
|
private int cycleRetryTimes = 0;
|
||||||
|
|
||||||
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
|
||||||
|
|
||||||
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
|
||||||
|
@ -200,7 +202,7 @@ public class Site {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get retry times when download fail, 0 by default.<br>
|
* Get retry times when download fail immediately, 0 by default.<br>
|
||||||
*
|
*
|
||||||
* @return retry times when download fail
|
* @return retry times when download fail
|
||||||
*/
|
*/
|
||||||
|
@ -218,6 +220,25 @@ public class Site {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When cycleRetryTimes is more than 0, it will add back to scheduler and try download again. <br>
|
||||||
|
*
|
||||||
|
* @return retry times when download fail
|
||||||
|
*/
|
||||||
|
public int getCycleRetryTimes() {
|
||||||
|
return cycleRetryTimes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set cycleRetryTimes times when download fail, 0 by default. Only work in RedisScheduler. <br>
|
||||||
|
*
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Site setCycleRetryTimes(int cycleRetryTimes) {
|
||||||
|
this.cycleRetryTimes = cycleRetryTimes;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
|
|
|
@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||||
|
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
||||||
import us.codecraft.webmagic.utils.ThreadUtils;
|
import us.codecraft.webmagic.utils.ThreadUtils;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
|
@ -309,6 +310,12 @@ public class Spider implements Runnable, Task {
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
//for cycle retry
|
||||||
|
if (page.getHtml()==null){
|
||||||
|
addRequest(page);
|
||||||
|
sleep(site.getSleepTime());
|
||||||
|
return;
|
||||||
|
}
|
||||||
pageProcessor.process(page);
|
pageProcessor.process(page);
|
||||||
addRequest(page);
|
addRequest(page);
|
||||||
if (!page.getResultItems().isSkip()) {
|
if (!page.getResultItems().isSkip()) {
|
||||||
|
@ -368,6 +375,14 @@ public class Spider implements Runnable, Task {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* switch off xsoup
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static void xsoupOff(){
|
||||||
|
EnvironmentUtil.setUseXsoup(false);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getUUID() {
|
public String getUUID() {
|
||||||
if (uuid != null) {
|
if (uuid != null) {
|
||||||
|
|
|
@ -46,6 +46,17 @@ public class HttpClientDownloader implements Downloader {
|
||||||
return (Html) page.getHtml();
|
return (Html) page.getHtml();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A simple method to download a url.
|
||||||
|
*
|
||||||
|
* @param url
|
||||||
|
* @return html
|
||||||
|
*/
|
||||||
|
public Html download(String url, String charset) {
|
||||||
|
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
|
||||||
|
return (Html) page.getHtml();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Page download(Request request, Task task) {
|
public Page download(Request request, Task task) {
|
||||||
Site site = null;
|
Site site = null;
|
||||||
|
@ -79,6 +90,21 @@ public class HttpClientDownloader implements Downloader {
|
||||||
|
|
||||||
if (tried > retryTimes) {
|
if (tried > retryTimes) {
|
||||||
logger.warn("download page " + request.getUrl() + " error", e);
|
logger.warn("download page " + request.getUrl() + " error", e);
|
||||||
|
if (site.getCycleRetryTimes() > 0) {
|
||||||
|
Page page = new Page();
|
||||||
|
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||||
|
if (cycleTriedTimesObject == null) {
|
||||||
|
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||||
|
} else {
|
||||||
|
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
|
||||||
|
cycleTriedTimes++;
|
||||||
|
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
|
||||||
|
}
|
||||||
|
return page;
|
||||||
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");
|
||||||
|
@ -87,13 +113,12 @@ public class HttpClientDownloader implements Downloader {
|
||||||
} while (retry);
|
} while (retry);
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
if (acceptStatCode.contains(statusCode)) {
|
if (acceptStatCode.contains(statusCode)) {
|
||||||
|
handleGzip(httpResponse);
|
||||||
//charset
|
//charset
|
||||||
if (charset == null) {
|
if (charset == null) {
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
charset = UrlUtils.getCharset(value);
|
charset = UrlUtils.getCharset(value);
|
||||||
}
|
}
|
||||||
//
|
|
||||||
handleGzip(httpResponse);
|
|
||||||
return handleResponse(request, charset, httpResponse, task);
|
return handleResponse(request, charset, httpResponse, task);
|
||||||
} else {
|
} else {
|
||||||
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
logger.warn("code error " + statusCode + "\t" + request.getUrl());
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* @since 0.3.0
|
||||||
|
*/
|
||||||
|
public abstract class BaseElementSelector implements Selector,ElementSelector {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String select(String text) {
|
||||||
|
return select(Jsoup.parse(text));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(String text) {
|
||||||
|
return selectList(Jsoup.parse(text));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,8 +1,6 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import org.jsoup.select.Elements;
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
|
@ -15,7 +13,7 @@ import java.util.List;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class CssSelector implements Selector {
|
public class CssSelector extends BaseElementSelector {
|
||||||
|
|
||||||
private String selectorText;
|
private String selectorText;
|
||||||
|
|
||||||
|
@ -30,16 +28,6 @@ public class CssSelector implements Selector {
|
||||||
this.attrName = attrName;
|
this.attrName = attrName;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public String select(String text) {
|
|
||||||
Document doc = Jsoup.parse(text);
|
|
||||||
Elements elements = doc.select(selectorText);
|
|
||||||
if (CollectionUtils.isEmpty(elements)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return getValue(elements.get(0));
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getValue(Element element) {
|
private String getValue(Element element) {
|
||||||
if (attrName == null) {
|
if (attrName == null) {
|
||||||
return element.outerHtml();
|
return element.outerHtml();
|
||||||
|
@ -51,9 +39,17 @@ public class CssSelector implements Selector {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(String text) {
|
public String select(Element element) {
|
||||||
|
Elements elements = element.select(selectorText);
|
||||||
|
if (CollectionUtils.isEmpty(elements)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return getValue(elements.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(Element doc) {
|
||||||
List<String> strings = new ArrayList<String>();
|
List<String> strings = new ArrayList<String>();
|
||||||
Document doc = Jsoup.parse(text);
|
|
||||||
Elements elements = doc.select(selectorText);
|
Elements elements = doc.select(selectorText);
|
||||||
if (CollectionUtils.isNotEmpty(elements)) {
|
if (CollectionUtils.isNotEmpty(elements)) {
|
||||||
for (Element element : elements) {
|
for (Element element : elements) {
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selector(extractor) for html elements.<br>
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.3.0
|
||||||
|
*/
|
||||||
|
public interface ElementSelector {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract single result in text.<br>
|
||||||
|
* If there are more than one result, only the first will be chosen.
|
||||||
|
*
|
||||||
|
* @param element
|
||||||
|
* @return result
|
||||||
|
*/
|
||||||
|
public String select(Element element);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract all results in text.<br>
|
||||||
|
*
|
||||||
|
* @param element
|
||||||
|
* @return results
|
||||||
|
*/
|
||||||
|
public List<String> selectList(Element element);
|
||||||
|
|
||||||
|
}
|
|
@ -1,5 +1,10 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -11,12 +16,29 @@ import java.util.List;
|
||||||
*/
|
*/
|
||||||
public class Html extends PlainText {
|
public class Html extends PlainText {
|
||||||
|
|
||||||
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store parsed document for better performance when only one text exist.
|
||||||
|
*/
|
||||||
|
private Document document;
|
||||||
|
|
||||||
public Html(List<String> strings) {
|
public Html(List<String> strings) {
|
||||||
super(strings);
|
super(strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
super(text);
|
super(text);
|
||||||
|
try {
|
||||||
|
this.document = Jsoup.parse(text);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("parse document error ", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Html(Document document) {
|
||||||
|
super(document.html());
|
||||||
|
this.document = document;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Html create(String text) {
|
public static Html create(String text) {
|
||||||
|
@ -47,32 +69,77 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable smartContent() {
|
public Selectable smartContent() {
|
||||||
SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector();
|
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||||
return select(smartContentSelector, strings);
|
return select(smartContentSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable links() {
|
public Selectable links() {
|
||||||
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href");
|
return xpath("//a/@href");
|
||||||
return selectList(xpathSelector, strings);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable xpath(String xpath) {
|
public Selectable xpath(String xpath) {
|
||||||
XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath);
|
if (EnvironmentUtil.useXsoup()) {
|
||||||
return selectList(xpathSelector, strings);
|
XsoupSelector xsoupSelector = new XsoupSelector(xpath);
|
||||||
|
if (document != null) {
|
||||||
|
return new Html(xsoupSelector.selectList(document));
|
||||||
|
}
|
||||||
|
return selectList(xsoupSelector, strings);
|
||||||
|
} else {
|
||||||
|
XpathSelector xpathSelector = new XpathSelector(xpath);
|
||||||
|
return selectList(xpathSelector, strings);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
CssSelector cssSelector = new CssSelector(selector);
|
CssSelector cssSelector = Selectors.$(selector);
|
||||||
|
if (document != null) {
|
||||||
|
return new Html(cssSelector.selectList(document));
|
||||||
|
}
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector, String attrName) {
|
public Selectable $(String selector, String attrName) {
|
||||||
CssSelector cssSelector = new CssSelector(selector, attrName);
|
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||||
|
if (document != null) {
|
||||||
|
return new Html(cssSelector.selectList(document));
|
||||||
|
}
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Document getDocument() {
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getText() {
|
||||||
|
if (strings != null && strings.size() > 0) {
|
||||||
|
return strings.get(0);
|
||||||
|
}
|
||||||
|
return document.html();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param selector
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public String selectDocument(Selector selector) {
|
||||||
|
if (selector instanceof ElementSelector) {
|
||||||
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
|
return elementSelector.select(getDocument());
|
||||||
|
} else {
|
||||||
|
return selector.select(getText());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> selectDocumentForList(Selector selector) {
|
||||||
|
if (selector instanceof ElementSelector) {
|
||||||
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
|
return elementSelector.selectList(getDocument());
|
||||||
|
} else {
|
||||||
|
return selector.selectList(getText());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,13 +57,13 @@ public class PlainText implements Selectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable regex(String regex) {
|
public Selectable regex(String regex) {
|
||||||
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex);
|
RegexSelector regexSelector = Selectors.regex(regex);
|
||||||
return selectList(regexSelector, strings);
|
return selectList(regexSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable regex(String regex, int group) {
|
public Selectable regex(String regex, int group) {
|
||||||
RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group);
|
RegexSelector regexSelector = Selectors.regex(regex, group);
|
||||||
return selectList(regexSelector, strings);
|
return selectList(regexSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ public class PlainText implements Selectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable replace(String regex, String replacement) {
|
public Selectable replace(String regex, String replacement) {
|
||||||
ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement);
|
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
|
||||||
return select(replaceSelector, strings);
|
return select(replaceSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,4 +106,9 @@ public class PlainText implements Selectable {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean match() {
|
||||||
|
return strings != null && strings.size() > 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,6 +82,13 @@ public interface Selectable {
|
||||||
*/
|
*/
|
||||||
public String toString();
|
public String toString();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* if result exist for select
|
||||||
|
*
|
||||||
|
* @return true if result exist
|
||||||
|
*/
|
||||||
|
public boolean match();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* multi string result
|
* multi string result
|
||||||
*
|
*
|
||||||
|
|
|
@ -9,11 +9,15 @@ package us.codecraft.webmagic.selector;
|
||||||
public abstract class Selectors {
|
public abstract class Selectors {
|
||||||
|
|
||||||
public static RegexSelector regex(String expr) {
|
public static RegexSelector regex(String expr) {
|
||||||
return SelectorFactory.getInstatnce().newRegexSelector(expr);
|
return new RegexSelector(expr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static RegexSelector regex(String expr, int group) {
|
public static RegexSelector regex(String expr, int group) {
|
||||||
return SelectorFactory.getInstatnce().newRegexSelector(expr, group);
|
return new RegexSelector(expr,group);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static SmartContentSelector smartContent() {
|
||||||
|
return new SmartContentSelector();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static CssSelector $(String expr) {
|
public static CssSelector $(String expr) {
|
||||||
|
@ -25,7 +29,11 @@ public abstract class Selectors {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static XpathSelector xpath(String expr) {
|
public static XpathSelector xpath(String expr) {
|
||||||
return SelectorFactory.getInstatnce().newXpathSelector(expr);
|
return new XpathSelector(expr);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static XsoupSelector xsoup(String expr) {
|
||||||
|
return new XsoupSelector(expr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static AndSelector and(Selector... selectors) {
|
public static AndSelector and(Selector... selectors) {
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import us.codecraft.xsoup.XPathEvaluator;
|
||||||
|
import us.codecraft.xsoup.Xsoup;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XPath selector based on Xsoup.<br>
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.3.0
|
||||||
|
*/
|
||||||
|
public class XsoupSelector extends BaseElementSelector {
|
||||||
|
|
||||||
|
private XPathEvaluator xPathEvaluator;
|
||||||
|
|
||||||
|
public XsoupSelector(String xpathStr) {
|
||||||
|
this.xPathEvaluator = Xsoup.compile(xpathStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String select(Element element) {
|
||||||
|
return xPathEvaluator.evaluate(element).get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> selectList(Element element) {
|
||||||
|
return xPathEvaluator.evaluate(element).list();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.BooleanUtils;
|
||||||
|
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* @since 0.3.0
|
||||||
|
*/
|
||||||
|
public abstract class EnvironmentUtil {
|
||||||
|
|
||||||
|
private static final String USE_XSOUP = "xsoup";
|
||||||
|
|
||||||
|
public static boolean useXsoup() {
|
||||||
|
Properties properties = System.getProperties();
|
||||||
|
Object o = properties.get(USE_XSOUP);
|
||||||
|
if (o == null) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return BooleanUtils.toBoolean(((String) o).toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void setUseXsoup(boolean useXsoup) {
|
||||||
|
Properties properties = System.getProperties();
|
||||||
|
properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -98,15 +99,17 @@ public class UrlUtils {
|
||||||
return stringBuilder.toString();
|
return stringBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern patternForCharset = Pattern.compile("charset=([^\\s;]*)");
|
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)");
|
||||||
|
|
||||||
public static String getCharset(String contentType) {
|
public static String getCharset(String contentType) {
|
||||||
Matcher matcher = patternForCharset.matcher(contentType);
|
Matcher matcher = patternForCharset.matcher(contentType);
|
||||||
if (matcher.find()) {
|
if (matcher.find()) {
|
||||||
return matcher.group(1);
|
String charset = matcher.group(1);
|
||||||
} else {
|
if (Charset.isSupported(charset)) {
|
||||||
return null;
|
return charset;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static junit.framework.Assert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class EnvironmentUtilTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
assertTrue(EnvironmentUtil.useXsoup());
|
||||||
|
EnvironmentUtil.setUseXsoup(false);
|
||||||
|
assertFalse(EnvironmentUtil.useXsoup());
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.2.1</version>
|
<version>0.3.1-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -34,7 +34,7 @@ class PageModelExtractor {
|
||||||
|
|
||||||
private List<FieldExtractor> fieldExtractors;
|
private List<FieldExtractor> fieldExtractors;
|
||||||
|
|
||||||
private Extractor extractor;
|
private Extractor objectExtractor;
|
||||||
|
|
||||||
public static PageModelExtractor create(Class clazz) {
|
public static PageModelExtractor create(Class clazz) {
|
||||||
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
PageModelExtractor pageModelExtractor = new PageModelExtractor();
|
||||||
|
@ -169,7 +169,7 @@ class PageModelExtractor {
|
||||||
annotation = clazz.getAnnotation(ExtractBy.class);
|
annotation = clazz.getAnnotation(ExtractBy.class);
|
||||||
if (annotation != null) {
|
if (annotation != null) {
|
||||||
ExtractBy extractBy = (ExtractBy) annotation;
|
ExtractBy extractBy = (ExtractBy) annotation;
|
||||||
extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -183,28 +183,28 @@ class PageModelExtractor {
|
||||||
if (!matched) {
|
if (!matched) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
if (extractor == null) {
|
if (objectExtractor == null) {
|
||||||
return processSingle(page, page.getHtml().toString());
|
return processSingle(page, null, false);
|
||||||
} else {
|
} else {
|
||||||
if (extractor.multi) {
|
if (objectExtractor.multi) {
|
||||||
List<Object> os = new ArrayList<Object>();
|
List<Object> os = new ArrayList<Object>();
|
||||||
List<String> list = extractor.getSelector().selectList(page.getHtml().toString());
|
List<String> list = objectExtractor.getSelector().selectList(page.getHtml().toString());
|
||||||
for (String s : list) {
|
for (String s : list) {
|
||||||
Object o = processSingle(page, s);
|
Object o = processSingle(page, s, false);
|
||||||
if (o != null) {
|
if (o != null) {
|
||||||
os.add(o);
|
os.add(o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
} else {
|
} else {
|
||||||
String select = extractor.getSelector().select(page.getHtml().toString());
|
String select = objectExtractor.getSelector().select(page.getHtml().toString());
|
||||||
Object o = processSingle(page, select);
|
Object o = processSingle(page, select, false);
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object processSingle(Page page, String html) {
|
private Object processSingle(Page page, String html, boolean isRaw) {
|
||||||
Object o = null;
|
Object o = null;
|
||||||
try {
|
try {
|
||||||
o = clazz.newInstance();
|
o = clazz.newInstance();
|
||||||
|
@ -213,10 +213,14 @@ class PageModelExtractor {
|
||||||
List<String> value;
|
List<String> value;
|
||||||
switch (fieldExtractor.getSource()) {
|
switch (fieldExtractor.getSource()) {
|
||||||
case RawHtml:
|
case RawHtml:
|
||||||
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
|
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
break;
|
break;
|
||||||
case Html:
|
case Html:
|
||||||
value = fieldExtractor.getSelector().selectList(html);
|
if (isRaw) {
|
||||||
|
value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
|
||||||
|
} else {
|
||||||
|
value = fieldExtractor.getSelector().selectList(html);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case Url:
|
case Url:
|
||||||
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
|
||||||
|
@ -232,10 +236,14 @@ class PageModelExtractor {
|
||||||
String value;
|
String value;
|
||||||
switch (fieldExtractor.getSource()) {
|
switch (fieldExtractor.getSource()) {
|
||||||
case RawHtml:
|
case RawHtml:
|
||||||
value = fieldExtractor.getSelector().select(page.getHtml().toString());
|
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
break;
|
break;
|
||||||
case Html:
|
case Html:
|
||||||
value = fieldExtractor.getSelector().select(html);
|
if (isRaw) {
|
||||||
|
value = page.getHtml().selectDocument(fieldExtractor.getSelector());
|
||||||
|
} else {
|
||||||
|
value = fieldExtractor.getSelector().select(html);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case Url:
|
case Url:
|
||||||
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
value = fieldExtractor.getSelector().select(page.getUrl().toString());
|
||||||
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import org.apache.commons.lang3.builder.ToStringBuilder;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.model.HasKey;
|
||||||
|
import us.codecraft.webmagic.model.PageModelPipeline;
|
||||||
|
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||||
|
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store results objects (page models) to files in plain format.<br>
|
||||||
|
* Use model.getKey() as file name if the model implements HasKey.<br>
|
||||||
|
* Otherwise use SHA1 as file name.
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.3.0
|
||||||
|
*/
|
||||||
|
public class FilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
|
||||||
|
|
||||||
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* new JsonFilePageModelPipeline with default path "/data/webmagic/"
|
||||||
|
*/
|
||||||
|
public FilePageModelPipeline() {
|
||||||
|
setPath("/data/webmagic/");
|
||||||
|
}
|
||||||
|
|
||||||
|
public FilePageModelPipeline(String path) {
|
||||||
|
setPath(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Object o, Task task) {
|
||||||
|
String path = this.path + "/" + task.getUUID() + "/";
|
||||||
|
try {
|
||||||
|
String filename;
|
||||||
|
if (o instanceof HasKey) {
|
||||||
|
filename = path + ((HasKey) o).key() + ".html";
|
||||||
|
} else {
|
||||||
|
filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".html";
|
||||||
|
}
|
||||||
|
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)));
|
||||||
|
printWriter.write(ToStringBuilder.reflectionToString(o));
|
||||||
|
printWriter.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.warn("write file error", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler {
|
||||||
public synchronized void push(Request request, Task task) {
|
public synchronized void push(Request request, Task task) {
|
||||||
Jedis jedis = pool.getResource();
|
Jedis jedis = pool.getResource();
|
||||||
try {
|
try {
|
||||||
//使用Set进行url去重
|
// if cycleRetriedTimes is set, allow duplicated.
|
||||||
if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
|
Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
|
||||||
//使用List保存队列
|
// use set to remove duplicate url
|
||||||
|
if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
|
||||||
|
// use list to store queue
|
||||||
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
|
||||||
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
|
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
|
||||||
if (request.getExtras() != null) {
|
if (request.getExtras() != null) {
|
||||||
|
|
|
@ -1,16 +1,14 @@
|
||||||
package us.codecraft.webmagic.utils;
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
import us.codecraft.webmagic.selector.CssSelector;
|
import us.codecraft.webmagic.selector.*;
|
||||||
import us.codecraft.webmagic.selector.RegexSelector;
|
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
|
||||||
import us.codecraft.webmagic.selector.XpathSelector;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tools for annotation converting. <br>
|
* Tools for annotation converting. <br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.2.1
|
* @since 0.2.1
|
||||||
*/
|
*/
|
||||||
|
@ -27,17 +25,27 @@ public class ExtractorUtils {
|
||||||
selector = new RegexSelector(value);
|
selector = new RegexSelector(value);
|
||||||
break;
|
break;
|
||||||
case XPath:
|
case XPath:
|
||||||
selector = new XpathSelector(value);
|
selector = getXpathSelector(value);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
selector = new XpathSelector(value);
|
selector = getXpathSelector(value);
|
||||||
|
}
|
||||||
|
return selector;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Selector getXpathSelector(String value) {
|
||||||
|
Selector selector;
|
||||||
|
if (EnvironmentUtil.useXsoup()) {
|
||||||
|
selector = new XsoupSelector(value);
|
||||||
|
} else {
|
||||||
|
selector = new XpathSelector(value);
|
||||||
}
|
}
|
||||||
return selector;
|
return selector;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<Selector> getSelectors(ExtractBy[] extractBies) {
|
public static List<Selector> getSelectors(ExtractBy[] extractBies) {
|
||||||
List<Selector> selectors = new ArrayList<Selector>();
|
List<Selector> selectors = new ArrayList<Selector>();
|
||||||
if (extractBies==null){
|
if (extractBies == null) {
|
||||||
return selectors;
|
return selectors;
|
||||||
}
|
}
|
||||||
for (ExtractBy extractBy : extractBies) {
|
for (ExtractBy extractBy : extractBies) {
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.2.1</version>
|
<version>0.3.1-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,37 @@
|
||||||
|
package us.codecraft.webmagic.model.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.model.AfterExtractor;
|
||||||
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
|
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author yihua.huang@dianping.com <br>
|
||||||
|
* Date: 13-8-13 <br>
|
||||||
|
* Time: 上午10:13 <br>
|
||||||
|
*/
|
||||||
|
@TargetUrl("http://*.alpha.dp/*")
|
||||||
|
public class DianpingFtlDataScanner implements AfterExtractor {
|
||||||
|
|
||||||
|
@ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true)
|
||||||
|
private List<String> data;
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class)
|
||||||
|
.thread(5).run();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void afterProcess(Page page) {
|
||||||
|
if (data.size() > 1) {
|
||||||
|
System.err.println(page.getUrl());
|
||||||
|
}
|
||||||
|
if (data.size() > 0 && data.get(0).length() > 100) {
|
||||||
|
System.err.println(page.getUrl());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
|
||||||
|
@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
if (page.getUrl().toString().contains("thread")){
|
if (page.getUrl().toString().contains("thread")){
|
||||||
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
||||||
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody"));
|
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
|
||||||
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
|
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
|
||||||
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
|
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
|
||||||
}
|
}
|
||||||
|
@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new DiaoyuwengProcessor()).run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,9 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -15,14 +17,18 @@ public class F58PageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
|
List<String> strings = page.getHtml().links().regex(".*/yewu/.*").all();
|
||||||
page.addTargetRequests(strings);
|
page.addTargetRequests(strings);
|
||||||
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
||||||
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
|
page.putField("body",page.getHtml().xpath("//dd"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
|
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -14,10 +15,9 @@ import java.util.List;
|
||||||
public class HuxiuProcessor implements PageProcessor {
|
public class HuxiuProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
//http://progressdaily.diandian.com/post/2013-01-24/40046867275
|
List<String> requests = page.getHtml().links().regex(".*article.*").all();
|
||||||
List<String> requests = page.getHtml().regex("<a[^<>\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all();
|
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']"));
|
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
|
||||||
page.putField("content",page.getHtml().smartContent());
|
page.putField("content",page.getHtml().smartContent());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,4 +26,8 @@ public class HuxiuProcessor implements PageProcessor {
|
||||||
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
|
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new HuxiuProcessor()).run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor {
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new InfoQMiniBookProcessor())
|
Spider.create(new InfoQMiniBookProcessor())
|
||||||
.scheduler(new RedisScheduler("localhost"))
|
|
||||||
.pipeline(new FilePipeline("/data/temp/webmagic/"))
|
|
||||||
.thread(5)
|
.thread(5)
|
||||||
.run();
|
.run();
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run();
|
Spider.create(new IteyeBlogProcessor()).thread(5).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor {
|
||||||
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
|
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new KaichibaProcessor()).run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
|
page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all());
|
||||||
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]"));
|
page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
|
||||||
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]"));
|
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor {
|
||||||
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
|
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new MeicanProcessor()).run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,8 @@
|
||||||
package us.codecraft.webmagic.samples;
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
||||||
page.addTargetRequests(links);
|
page.addTargetRequests(links);
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
|
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
|
||||||
page.putField("content", page.getHtml().$("div.content").toString());
|
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
|
||||||
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run();
|
Spider.create(new OschinaBlogPageProcesser()).run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
package us.codecraft.webmagic.samples.scheduler;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.scheduler.PriorityScheduler;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.DelayQueue;
|
||||||
|
import java.util.concurrent.Delayed;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class DelayQueueScheduler extends PriorityScheduler {
|
||||||
|
|
||||||
|
private DelayQueue<RequestWrapper> queue = new DelayQueue<RequestWrapper>();
|
||||||
|
|
||||||
|
private Set<String> urls = new HashSet<String>();
|
||||||
|
|
||||||
|
private long time;
|
||||||
|
|
||||||
|
private TimeUnit timeUnit;
|
||||||
|
|
||||||
|
private class RequestWrapper implements Delayed {
|
||||||
|
|
||||||
|
private long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
|
private Request request;
|
||||||
|
|
||||||
|
private RequestWrapper(Request request) {
|
||||||
|
this.request = request;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getStartTime() {
|
||||||
|
return startTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Request getRequest() {
|
||||||
|
return request;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getDelay(TimeUnit unit) {
|
||||||
|
long convert = unit.convert(TimeUnit.MILLISECONDS.convert(time, timeUnit) - System.currentTimeMillis() + startTime, TimeUnit.MILLISECONDS);
|
||||||
|
return convert;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compareTo(Delayed o) {
|
||||||
|
return new Long(getDelay(TimeUnit.MILLISECONDS)).compareTo(o.getDelay(TimeUnit.MILLISECONDS));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public DelayQueueScheduler(long time, TimeUnit timeUnit) {
|
||||||
|
this.time = time;
|
||||||
|
this.timeUnit = timeUnit;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void push(Request request, Task task) {
|
||||||
|
if (urls.add(request.getUrl())) {
|
||||||
|
queue.add(new RequestWrapper(request));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized Request poll(Task task) {
|
||||||
|
RequestWrapper take = null;
|
||||||
|
while (take == null) {
|
||||||
|
try {
|
||||||
|
take = queue.take();
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
queue.add(new RequestWrapper(take.getRequest()));
|
||||||
|
return take.getRequest();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,24 @@
|
||||||
|
package us.codecraft.webmagic.samples.scheduler;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.scheduler.PriorityScheduler;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class LevelLimitScheduler extends PriorityScheduler {
|
||||||
|
|
||||||
|
private int levelLimit = 3;
|
||||||
|
|
||||||
|
public LevelLimitScheduler(int levelLimit) {
|
||||||
|
this.levelLimit = levelLimit;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void push(Request request, Task task) {
|
||||||
|
if (((Integer) request.getExtra("_level")) <= levelLimit) {
|
||||||
|
super.push(request, task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,89 @@
|
||||||
|
package us.codecraft.webmagic.samples.scheduler;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.scheduler.PriorityScheduler;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static us.codecraft.webmagic.selector.Selectors.regex;
|
||||||
|
import static us.codecraft.webmagic.selector.Selectors.xpath;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class ZipCodePageProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site = Site.me().setCharset("gb2312")
|
||||||
|
.setSleepTime(100).addStartUrl("http://www.ip138.com/post/");
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
if (page.getUrl().toString().equals("http://www.ip138.com/post/")) {
|
||||||
|
processCountry(page);
|
||||||
|
} else if (page.getUrl().regex("http://www\\.ip138\\.com/post/\\w+[/]?$").toString() != null) {
|
||||||
|
processProvince(page);
|
||||||
|
} else {
|
||||||
|
processDistrict(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processCountry(Page page) {
|
||||||
|
List<String> provinces = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
|
||||||
|
for (String province : provinces) {
|
||||||
|
String link = xpath("//@href").select(province);
|
||||||
|
String title = xpath("/text()").select(province);
|
||||||
|
Request request = new Request(link).setPriority(0).putExtra("province", title);
|
||||||
|
page.addTargetRequest(request);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processProvince(Page page) {
|
||||||
|
//这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉
|
||||||
|
List<String> districts = page.getHtml().xpath("//body/table/tbody/tr/td").regex(".*http://www\\.ip138\\.com/post/\\w+/\\w+.*").all();
|
||||||
|
for (String district : districts) {
|
||||||
|
String link = xpath("//@href").select(district);
|
||||||
|
String title = xpath("/text()").select(district);
|
||||||
|
Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title);
|
||||||
|
page.addTargetRequest(request);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processDistrict(Page page) {
|
||||||
|
String province = page.getRequest().getExtra("province").toString();
|
||||||
|
String district = page.getRequest().getExtra("district").toString();
|
||||||
|
List<String> counties = page.getHtml().xpath("//body/table/tbody/tr").regex(".*<td>\\d+</td>.*").all();
|
||||||
|
String regex = "<td[^<>]*>([^<>]+)</td><td[^<>]*>([^<>]+)</td><td[^<>]*>([^<>]+)</td><td[^<>]*>([^<>]+)</td>";
|
||||||
|
for (String county : counties) {
|
||||||
|
String county0 = regex(regex, 1).select(county);
|
||||||
|
String county1 = regex(regex, 2).select(county);
|
||||||
|
String zipCode = regex(regex, 3).select(county);
|
||||||
|
page.putField("result", StringUtils.join(new String[]{province, district,
|
||||||
|
county0, county1, zipCode}, "\t"));
|
||||||
|
}
|
||||||
|
List<String> links = page.getHtml().links().regex("http://www\\.ip138\\.com/post/\\w+/\\w+").all();
|
||||||
|
for (String link : links) {
|
||||||
|
page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return site;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).run();
|
||||||
|
|
||||||
|
PriorityScheduler scheduler = new PriorityScheduler();
|
||||||
|
Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(scheduler);
|
||||||
|
scheduler.push(new Request("http://www.baidu.com/s?wd=webmagic&f=12&rsp=0&oq=webmagix&tn=baiduhome_pg&ie=utf-8"),spider);
|
||||||
|
spider.run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,891 @@
|
||||||
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.model.samples.OschinaBlog;
|
||||||
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class ProcessorBenchmark {
|
||||||
|
|
||||||
|
@Ignore
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class);
|
||||||
|
Page page = new Page();
|
||||||
|
page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
|
||||||
|
page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
|
||||||
|
page.setHtml(new Html(html));
|
||||||
|
long time = System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
modelPageProcessor.process(page);
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis() - time);
|
||||||
|
time = System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
modelPageProcessor.process(page);
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis() - time);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String html = "\n" +
|
||||||
|
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
|
||||||
|
"<html lang='zh-CN' xml:lang='zh-CN' xmlns='http://www.w3.org/1999/xhtml'>\n" +
|
||||||
|
"<head>\n" +
|
||||||
|
" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>\n" +
|
||||||
|
" <meta http-equiv=\"Content-Language\" content=\"zh-CN\"/>\n" +
|
||||||
|
" <meta name=\"robots\" content=\"index, follow\" />\n" +
|
||||||
|
" <link rel=\"shortcut icon\" type=\"image/x-icon\" href=\"/img/favicon.ico\" />\n" +
|
||||||
|
" <title>Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区</title>\n" +
|
||||||
|
" <meta name=\"Keywords\" content=\"Jsoup,XSS,OO\"/>\n" +
|
||||||
|
" <meta name=\"Description\" content=\"Jsoup代码解读之八-防御XSS攻击:![hacker][1] ## 防御XSS攻击的一般原理 cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的...\"/>\n" +
|
||||||
|
" <link rel=\"stylesheet/less\" href=\"http://my.oschina.net/flashsword/styles.less?ver=20130608&date=20130524070359\" type=\"text/css\" media=\"screen\" />\n" +
|
||||||
|
" <link rel=\"stylesheet\" href=\"/js/2012/poshytip/tip-yellowsimple/tip-yellowsimple.css\" type=\"text/css\" />\n" +
|
||||||
|
" <link rel=\"stylesheet\" type=\"text/css\" href=\"/js/2011/fancybox/jquery.fancybox-1.3.4.css\" media=\"screen\" />\n" +
|
||||||
|
" <script type=\"text/javascript\" src=\"/js/2012/jquery-1.7.1.min.js\"></script>\n" +
|
||||||
|
" <script type=\"text/javascript\" src=\"/js/2012/jquery.form.js\"></script>\n" +
|
||||||
|
" <script type=\"text/javascript\" src=\"/js/2011/fancybox/jquery.fancybox-1.3.4.pack.js\"></script>\n" +
|
||||||
|
" <script type=\"text/javascript\" src=\"/js/2012/poshytip/jquery.poshytip.min.js\"></script>\n" +
|
||||||
|
" <script type=\"text/javascript\" src=\"/js/2011/oschina.js?ver=20121007\"></script>\n" +
|
||||||
|
" <script type=\"text/javascript\" src=\"/js/2012/less-1.3.0.min.js\"></script>\n" +
|
||||||
|
" <script type=\"text/javascript\" src=\"/js/scrolltopcontrol.js\"></script>\n" +
|
||||||
|
" <script type='text/javascript' src='/js/jquery/jquery.atwho.js'></script>\n" +
|
||||||
|
" <link rel=\"stylesheet\" type=\"text/css\" href=\"/js/jquery/jquery.atwho.css\" />\n" +
|
||||||
|
" <link rel=\"alternate\" type=\"application/rss+xml\" title=\"黄亿华最新博客\" href=\"http://my.oschina.net/flashsword/rss\" />\n" +
|
||||||
|
" <link rel=\"EditURI\" type=\"application/rsd+xml\" title=\"RSD\" href=\"http://my.oschina.net/action/xmlrpc/rsd?space=190591\" />\n" +
|
||||||
|
" <link rel=\"wlwmanifest\" type=\"application/wlwmanifest+xml\" href=\"http://my.oschina.net/action/xmlrpc/wlwmanifest?space=190591\" /> \n" +
|
||||||
|
" <style type=\"text/css\">\n" +
|
||||||
|
" body,table,input,textarea,select {font-family:Verdana,sans-serif,宋体;}\t\n" +
|
||||||
|
" </style>\n" +
|
||||||
|
" <script type=\"text/javascript\">\n" +
|
||||||
|
" \tscrolltotop.offset(100,165);\n" +
|
||||||
|
"\tscrolltotop.init();\n" +
|
||||||
|
" </script>\n" +
|
||||||
|
"</head>\n" +
|
||||||
|
"<body>\n" +
|
||||||
|
"<div id=\"OSC_Screen\">\n" +
|
||||||
|
"\t<div id='OSC_Banner'>\n" +
|
||||||
|
"\t\t<div id=\"OSC_Logo\">\n" +
|
||||||
|
" \t<a href=\"http://www.oschina.net/\" title=\"开源中国社区首页\">开源中国社区</a>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
" <div id='OSC_Slogon'>开源项目发现、使用和交流平台</div>\n" +
|
||||||
|
"\t\t <div id=\"OSC_Channels\">\n" +
|
||||||
|
" \t<ul>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/project\" class='software'>项目</a></li>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/question\" class='question'>讨论</a></li>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/code/list\" class='code'>代码</a></li>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/news\" class='news'>资讯</a></li>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/translate\" class='translate'>翻译</a></li>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/blog\" class='blog'>博客</a></li>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/android\" class='android'>Android</a></li>\n" +
|
||||||
|
" \t<li><a href=\"http://www.oschina.net/job\" class='job'>招聘</a></li>\n" +
|
||||||
|
" \t</ul>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
" <div class='clear'></div>\n" +
|
||||||
|
"\t</div>\n" +
|
||||||
|
"\t<div id=\"OSC_Topbar\">\n" +
|
||||||
|
"\t\t<div id=\"VisitorInfo\">\n" +
|
||||||
|
"\t\t当前访客身份:\n" +
|
||||||
|
"\t\t\t\t黄亿华 [ <a href=\"/action/user/logout?session=6db40e6e2d1061998068&goto_page=http%3A%2F%2Fmy.oschina.net%2Fflashsword\">退出</a> ]\n" +
|
||||||
|
"\t\t\t\t<span id=\"OSC_Notification\">\t\t\t\n" +
|
||||||
|
"\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<a href=\"http://my.oschina.net/flashsword/admin/inbox\" class=\"msgbox\" title=\"进入我的留言箱\">你有<em>0</em>新留言</a>\t\t\t\n" +
|
||||||
|
"\t\t\t\t\t\t\t\t\t\t\t</span>\n" +
|
||||||
|
"\t\t</div>\n" +
|
||||||
|
"\t\t<div id=\"SearchBar\">\n" +
|
||||||
|
" \t\t<form action=\"http://www.oschina.net/search\">\n" +
|
||||||
|
"\t\t\t\t<input type='hidden' name='user' value='190591'/>\n" +
|
||||||
|
"\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<span class=\"ipt f_l\">\n" +
|
||||||
|
" \t\t\t<input type='text' id='txt_q' name='q' class='SERACH' value='在 26755 款开源软件中搜索' onblur=\"(this.value=='')?this.value='在 26755 款开源软件中搜索':this.value\" onfocus=\"if(this.value=='在 26755 款开源软件中搜索'){this.value='';};this.select();\"/>\n" +
|
||||||
|
"\t\t\t\t</span>\n" +
|
||||||
|
"\t\t\t\t <div class=\"search-by selectbox\">\n" +
|
||||||
|
" \t\t\t\t<span class=\"hide\">\n" +
|
||||||
|
" \t\t\t\t<select name='scope'>\t\t\t\t\t\n" +
|
||||||
|
" <option value='project' selected>软件</option>\n" +
|
||||||
|
" <option value='code'>代码</option>\n" +
|
||||||
|
" <option value='bbs'>讨论区</option>\n" +
|
||||||
|
" <option value='news'>新闻</option>\n" +
|
||||||
|
" <option value='blog'>博客</option>\n" +
|
||||||
|
" \t\t\t\t</select>\n" +
|
||||||
|
" \t\t\t\t</span>\n" +
|
||||||
|
" <div class=\"search_on\" id=\"search-item\"><span class=\"text\">软件</span></div>\n" +
|
||||||
|
" <ul class=\"search_list\">\n" +
|
||||||
|
" <li class=\"search-item\"><a href=\"#1\">软件</a></li>\n" +
|
||||||
|
" <li><a href=\"#2\">代码</a></li>\n" +
|
||||||
|
" <li><a href=\"#3\">讨论区</a></li>\n" +
|
||||||
|
" <li><a href=\"#4\">新闻</a></li>\n" +
|
||||||
|
" <li><a href=\"#5\">博客</a></li>\n" +
|
||||||
|
" </ul>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"\t\t\t\t<input type='submit' value='搜索' class='bnt f_r'/>\t\t\t\n" +
|
||||||
|
" \t\t</form>\n" +
|
||||||
|
"\t\t</div>\n" +
|
||||||
|
"\t\t<div class='clear'></div>\n" +
|
||||||
|
"\t</div>\n" +
|
||||||
|
"\t<div id=\"OSC_Content\">\t\n" +
|
||||||
|
"\n" +
|
||||||
|
"<div id='SpaceLeft'>\n" +
|
||||||
|
"<div class='Owner'>\n" +
|
||||||
|
"\t\t<a href='http://my.oschina.net/flashsword/admin/user-settings?tab=3' title='切换空间风格' class='ThemeSetting'>切换风格</a> <a href=\"http://my.oschina.net/flashsword\" class='Img'><img src=\"http://static.oschina.net/uploads/user/95/190591_100.jpg?t=1347254905000\" align=\"absmiddle\" alt=\"黄亿华\" title=\"黄亿华\" class=\"LargePortrait\"/></a>\n" +
|
||||||
|
" <span class='U'>\n" +
|
||||||
|
" <a href=\"http://my.oschina.net/flashsword\" class='Name' title='男'>黄亿华</a>\n" +
|
||||||
|
"\t\t<span class='opts'>\n" +
|
||||||
|
"\t\t\t<img src=\"/img/2012/men.png\" align='absmiddle' title='男'/>\n" +
|
||||||
|
" \t\t\t<a href=\"http://my.oschina.net/flashsword/admin/profile\">修改资料</a>\n" +
|
||||||
|
"\t\t\t<a href=\"http://my.oschina.net/flashsword/admin/portrait\">更换头像</a>\n" +
|
||||||
|
" \t\t</span>\n" +
|
||||||
|
" </span>\n" +
|
||||||
|
" <div class='clear'></div>\n" +
|
||||||
|
" <div class='stat'>\n" +
|
||||||
|
" \t<a href=\"http://my.oschina.net/flashsword/fellow\">关注(43)</a>\n" +
|
||||||
|
" \t<a href=\"http://my.oschina.net/flashsword/fans\">粉丝(98)</a>\n" +
|
||||||
|
" \t<a href=\"http://www.oschina.net/question/3307_20931\" title=\"查看OSCHINA积分规则\">积分(173)</a>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"</div><style>\n" +
|
||||||
|
"#MyResume textarea {width:170px;height:60px;font-size:9pt;}\n" +
|
||||||
|
"</style>\n" +
|
||||||
|
"<div class='Resume' id='MyResume'>\n" +
|
||||||
|
"码农一枚<br/>实用主义者<br/>抵制重复造轮子,却造了不少轮子<br/>http://codecraft.us</div>\n" +
|
||||||
|
"<script type=\"text/javascript\" src=\"/js/2012/jquery.editinplace.js\"></script>\n" +
|
||||||
|
"<script type=\"text/javascript\">\n" +
|
||||||
|
"$(\"#MyResume\").editInPlace({\n" +
|
||||||
|
" url: \"/action/profile/update_user_signature?user_code=tzm9Wg2YoU8SkJaTIjHQkahStiXQNyymUGXFOQgN\",\n" +
|
||||||
|
"\tbg_over: \"none\",\n" +
|
||||||
|
"\tbg_out: \"none\",\n" +
|
||||||
|
" field_type: \"textarea\",\n" +
|
||||||
|
"\tvalue_required: \"true\",\n" +
|
||||||
|
"\terror: function(){\n" +
|
||||||
|
"\t\talert(\"修改个人简介失败\");\n" +
|
||||||
|
"\t}\n" +
|
||||||
|
"});\n" +
|
||||||
|
"</script>\n" +
|
||||||
|
"\n" +
|
||||||
|
"<div class='Opts clearfix'>\n" +
|
||||||
|
"\t<a href=\"http://my.oschina.net/flashsword/admin/new-blog\" class='a1 blog'><i>.</i><span>发表博文</span></a>\n" +
|
||||||
|
"\t<a href=\"http://my.oschina.net/flashsword/admin\" class='a2 admin'><i>.</i><span>空间管理</span></a>\n" +
|
||||||
|
"</div><div class=\"Mod\" id=\"BlogCatalogs\">\n" +
|
||||||
|
" <strong><a href=\"http://my.oschina.net/flashsword/admin/blog-catalogs\" class=\"more\">管理»</a> 博客分类</strong>\n" +
|
||||||
|
" <ul>\n" +
|
||||||
|
"\t\t\t<li class='draft'><a href=\"http://my.oschina.net/flashsword/admin/drafts\">草稿箱</a><span>(4)</span></li>\n" +
|
||||||
|
"\t \t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=371362\">webmagic</a><span>(16)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=380473\">分布式消息系统</a><span>(5)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=285504\">探耽求究</a><span>(5)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=368513\">BlackHoleJ</a><span>(21)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=368514\">Intellij</a><span>(4)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=112331\">工作日志</a><span>(7)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=112332\">日常记录</a><span>(4)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=261044\">codecraft</a><span>(1)</span></li>\n" +
|
||||||
|
"\t\t<li><a href=\"http://my.oschina.net/flashsword/blog?catalog=279271\">开发日记</a><span>(3)</span></li>\n" +
|
||||||
|
"\t </ul>\n" +
|
||||||
|
"</div><div class=\"Mod\" id=\"HotBlogs\">\n" +
|
||||||
|
" <strong>阅读排行</strong>\n" +
|
||||||
|
" <ol>\n" +
|
||||||
|
"\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/145796\">1. webmagic的设计机制及原理-如何开发一个Java爬虫</a></li>\n" +
|
||||||
|
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/143028\">2. monkeysocks开发日志--TCP协议分析及架构规划</a></li>\n" +
|
||||||
|
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/156638\">3. 【整理】国内一些大公司的开源项目</a></li>\n" +
|
||||||
|
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/110276\">4. BlackHole开发日志--防止DNS污染</a></li>\n" +
|
||||||
|
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/158200\">5. Jsoup代码解读之八-防御XSS攻击</a></li>\n" +
|
||||||
|
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/123505\">6. IntelliJ IDEA使用心得</a></li>\n" +
|
||||||
|
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/80037\">7. 关于HTTP keep-alive的实验</a></li>\n" +
|
||||||
|
"\t\t\t\t<li><a href=\"http://my.oschina.net/flashsword/blog/152263\">8. 分布式消息系统研究报告之Kafka</a></li>\n" +
|
||||||
|
"\t\t </ol>\n" +
|
||||||
|
"</div>\n" +
|
||||||
|
"<div class=\"Mod\" id=\"BlogReplies\">\n" +
|
||||||
|
" <strong><a href=\"http://my.oschina.net/flashsword/admin/blog-comments\" class=\"more\">管理»</a> 最新评论</strong> \n" +
|
||||||
|
" <ul>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275640366&type=18&user=190591\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/lidongyang\">@lidongyang</a>:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275640301&type=18&user=723383\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275638563&type=18&user=190591\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/lidongyang\">@lidongyang</a>:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275638070&type=18&user=723383\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“searchjack”的评论 不是好的就会被认...\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275617319&type=18&user=190591\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/searchjack\">@searchjack</a>:不是好的就会被认可, 干自己的, 到时候, 单干\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275617235&type=18&user=234880\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/searchjack\">@searchjack</a>:极好的工具,\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275616963&type=18&user=234880\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275599170&type=18&user=190591\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/rox\">@静风流云</a>:貌似,OSC也是类似处理的。\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275599137&type=18&user=180\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t<li>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/flashsword\">@黄亿华</a>:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" +
|
||||||
|
"\t\t<a href=\"/action/tweet/go?obj=275570030&type=18&user=190591\">查看»</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t </ul>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"<div class='Mod' id='Stat'>\n" +
|
||||||
|
"<strong>访客统计</strong>\n" +
|
||||||
|
"<ul>\n" +
|
||||||
|
"\t<li><label>今日访问:</label>6 (<a href=\"http://my.oschina.net/flashsword/visitors\">查看最新访客»</a>)</li>\n" +
|
||||||
|
" <li><label>昨日访问:</label>284</li>\n" +
|
||||||
|
" <li><label>本周访问:</label>817</li>\n" +
|
||||||
|
" <li><label>本月访问:</label>1888</li>\n" +
|
||||||
|
" <li><label>所有访问:</label>16453</li>\n" +
|
||||||
|
"</ul>\n" +
|
||||||
|
"</div></div>\n" +
|
||||||
|
"\n" +
|
||||||
|
"<div class='SpaceList'>\n" +
|
||||||
|
"\t<div class='TopBar'>\n" +
|
||||||
|
" \t<div class='NavPath'>\t\t\n" +
|
||||||
|
" \t\t<a href='http://my.oschina.net/flashsword'>空间</a> » <a href='http://my.oschina.net/flashsword/blog'>博客</a>\t\t\t\n" +
|
||||||
|
"\t\t\t» <a href=\"http://my.oschina.net/flashsword/blog?catalog=371362\">webmagic</a>\n" +
|
||||||
|
"\t\t\t» 博客正文\n" +
|
||||||
|
" \t</div>\n" +
|
||||||
|
"\t</div>\n" +
|
||||||
|
"\t\n" +
|
||||||
|
" \t<div class='BlogEntity'>\t\t\n" +
|
||||||
|
" <div class='BlogTitle'>\n" +
|
||||||
|
" <h1><img src='/img/space/b1.gif' align='absmiddle'/> Jsoup代码解读之八-防御XSS攻击</h1>\n" +
|
||||||
|
" <div class='BlogStat'>\n" +
|
||||||
|
" \t\t \t\t \t\t<span class='admin'>\n" +
|
||||||
|
" \t\t\t<a href=\"http://my.oschina.net/flashsword/admin/edit-blog?blog=158200\">编辑</a> | <a href=\"javascript:delete_blog(158200)\">删除</a>\n" +
|
||||||
|
" \t\t</span>\n" +
|
||||||
|
"\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" +
|
||||||
|
" \t\t已有<strong>1628</strong>次阅读 ,共<strong><a href=\"#comments\">3</a></strong>个评论\n" +
|
||||||
|
" \t\t\t\t\t,共 <strong>79</strong> 人收藏此文 \t</div> \n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"\t \t <div class=\"BlogAnchor\">\n" +
|
||||||
|
" <p>目录:[ <strong><a href=\"#\" id=\"AnchorContentToggle\" title=\"收起\">-</a></strong> ]</p>\n" +
|
||||||
|
" <div class=\"AnchorContent\" id=\"AnchorContent\"><li class='osc_h2'><a href='#OSC_h2_1'>防御XSS攻击的一般原理</a></li><li class='osc_h2'><a href='#OSC_h2_2'>Cleaner与Whitelist</a></li><li class='osc_h2'><a href='#OSC_h2_3'>结束语</a></li></div>\n" +
|
||||||
|
" \t </div>\n" +
|
||||||
|
" <script>\n" +
|
||||||
|
"\t\t \t$(function(){\n" +
|
||||||
|
"\t\t\t\t$(\"#AnchorContentToggle\").click(function(){\n" +
|
||||||
|
"\t\t\t\t\tvar text = $(this).html();\n" +
|
||||||
|
"\t\t\t\t\tif(text==\"-\"){\n" +
|
||||||
|
"\t\t\t\t\t\t$(this).html(\"+\");\n" +
|
||||||
|
"\t\t\t\t\t\t$(this).attr({\"title\":\"展开\"});\n" +
|
||||||
|
"\t\t\t\t\t}else{\n" +
|
||||||
|
"\t\t\t\t\t\t$(this).html(\"-\");\n" +
|
||||||
|
"\t\t\t\t\t\t$(this).attr({\"title\":\"收起\"});\n" +
|
||||||
|
"\t\t\t\t\t}\n" +
|
||||||
|
"\t\t\t\t\t$(\"#AnchorContent\").toggle();\n" +
|
||||||
|
"\t\t\t\t});\n" +
|
||||||
|
"\t\t\t});\n" +
|
||||||
|
"\t\t </script>\n" +
|
||||||
|
"\t \t <div class='BlogContent'><p><img src=\"http://static.oschina.net/uploads/space/2013/0831/071752_RBZc_190591.png\" /></p> \n" +
|
||||||
|
"<span id=\"OSC_h2_1\"></span>\n" +
|
||||||
|
"<h2>防御XSS攻击的一般原理</h2> \n" +
|
||||||
|
"<p>cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。</p> \n" +
|
||||||
|
"<p>我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<code><</code>,<code>></code>,<code>"</code>,<code>'</code>是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。</p> \n" +
|
||||||
|
"<p>在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:</p> \n" +
|
||||||
|
"<ol> \n" +
|
||||||
|
" <li><p>将HTML解析为DOM树</p> <p>这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用<code></textarea></code>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。</p></li> \n" +
|
||||||
|
" <li><p>过滤高风险标签/属性/属性值</p> <p>高风险标签是指<code><script></code>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如<code>onclick='alert("xss!")'</code>。</p></li> \n" +
|
||||||
|
" <li><p>重新将DOM树输出为HTML文本</p> <p>DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。</p></li> \n" +
|
||||||
|
"</ol> \n" +
|
||||||
|
"<span id=\"OSC_h2_2\"></span>\n" +
|
||||||
|
"<h2>Cleaner与Whitelist</h2> \n" +
|
||||||
|
"<p>对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。</p> \n" +
|
||||||
|
"<p>Jsoup给出的答案是白名单。下面是<code>Whitelist</code>的部分代码。</p> \n" +
|
||||||
|
"<pre class=\"brush: java\">public class Whitelist {\n" +
|
||||||
|
" private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
|
||||||
|
" private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
|
||||||
|
" private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
|
||||||
|
" private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
|
||||||
|
" private boolean preserveRelativeLinks; // option to preserve relative links\n" +
|
||||||
|
"}</pre> \n" +
|
||||||
|
"<p>这里定义了标签名/属性名/属性值的白名单。</p> \n" +
|
||||||
|
"<p>而<code>Cleaner</code>是过滤的执行者。不出所料,Cleaner内部定义了<code>CleaningVisitor</code>来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到<code>Element destination</code>里去。</p> \n" +
|
||||||
|
"<pre class=\"brush: java\">private final class CleaningVisitor implements NodeVisitor {\n" +
|
||||||
|
" private int numDiscarded = 0;\n" +
|
||||||
|
" private final Element root;\n" +
|
||||||
|
" private Element destination; // current element to append nodes to\n" +
|
||||||
|
"\n" +
|
||||||
|
" private CleaningVisitor(Element root, Element destination) {\n" +
|
||||||
|
" this.root = root;\n" +
|
||||||
|
" this.destination = destination;\n" +
|
||||||
|
" }\n" +
|
||||||
|
"\n" +
|
||||||
|
" public void head(Node source, int depth) {\n" +
|
||||||
|
" if (source instanceof Element) {\n" +
|
||||||
|
" Element sourceEl = (Element) source;\n" +
|
||||||
|
"\n" +
|
||||||
|
" if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
|
||||||
|
" ElementMeta meta = createSafeElement(sourceEl);\n" +
|
||||||
|
" Element destChild = meta.el;\n" +
|
||||||
|
" destination.appendChild(destChild);\n" +
|
||||||
|
"\n" +
|
||||||
|
" numDiscarded += meta.numAttribsDiscarded;\n" +
|
||||||
|
" destination = destChild;\n" +
|
||||||
|
" } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
|
||||||
|
" numDiscarded++;\n" +
|
||||||
|
" }\n" +
|
||||||
|
" } else if (source instanceof TextNode) {\n" +
|
||||||
|
" TextNode sourceText = (TextNode) source;\n" +
|
||||||
|
" TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
|
||||||
|
" destination.appendChild(destText);\n" +
|
||||||
|
" } else { // else, we don't care about comments, xml proc instructions, etc\n" +
|
||||||
|
" numDiscarded++;\n" +
|
||||||
|
" }\n" +
|
||||||
|
" }\n" +
|
||||||
|
"\n" +
|
||||||
|
" public void tail(Node source, int depth) {\n" +
|
||||||
|
" if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
|
||||||
|
" destination = destination.parent(); // would have descended, so pop destination stack\n" +
|
||||||
|
" }\n" +
|
||||||
|
" }\n" +
|
||||||
|
"}</pre> \n" +
|
||||||
|
"<span id=\"OSC_h2_3\"></span>\n" +
|
||||||
|
"<h2>结束语</h2> \n" +
|
||||||
|
"<p>至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:</p> \n" +
|
||||||
|
"<ul> \n" +
|
||||||
|
" <li><p>最好的代码抽象,是对现实概念的映射。</p> <p>这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。</p></li> \n" +
|
||||||
|
" <li><p>不要过度抽象</p> <p>在Jsoup里,只用到了两个接口,一个是<code>NodeVisitor</code>,一个是<code>Connection</code>,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。</p> <p>另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。</p></li> \n" +
|
||||||
|
"</ul> \n" +
|
||||||
|
"<p>最后继续贴上Jsoup解读系列的github地址:<a href=\"https://github.com/code4craft/jsoup-learning/\" rel=\"nofollow\">https://github.com/code4craft/jsoup-learning/</a></p></div>\n" +
|
||||||
|
" \t \t \n" +
|
||||||
|
" \t\n" +
|
||||||
|
"\t <div class='BlogTags'>\n" +
|
||||||
|
" \t<strong>关键字:</strong>\n" +
|
||||||
|
" \t \t<a href=\"http://www.oschina.net/search?scope=blog&q=Jsoup\" class=\"tag\">Jsoup</a>\n" +
|
||||||
|
" \t \t<a href=\"http://www.oschina.net/search?scope=blog&q=XSS\" class=\"tag\">XSS</a>\n" +
|
||||||
|
" \t \t<a href=\"http://www.oschina.net/search?scope=blog&q=OO\" class=\"tag\">OO</a>\n" +
|
||||||
|
" \t \t </div>\n" +
|
||||||
|
"\t \t \n" +
|
||||||
|
" <div class='BlogCopyright'>\t\t\n" +
|
||||||
|
"\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" +
|
||||||
|
"\t \t </div>\n" +
|
||||||
|
"\n" +
|
||||||
|
" <div class='BlogLinks'>\n" +
|
||||||
|
" \t<ul>\n" +
|
||||||
|
" <li class='prev'><a href=\"http://my.oschina.net/flashsword/blog/158171\" title=\"上一篇:Jsoup代码解读之七-实现一个CSS Selector\">« Jsoup代码解读之七-实现一个CSS Selector</a></li> \t</ul>\n" +
|
||||||
|
"\t\t </div>\n" +
|
||||||
|
"\t</div>\n" +
|
||||||
|
"\n" +
|
||||||
|
"\t<style type='text/css'>\n" +
|
||||||
|
"\t#BlogShare strong{float:left;padding-top:10px;font-size:11pt;color:#444;}\n" +
|
||||||
|
"\t#BlogShare a.share_sina{float:left;width:32px;height:32px;background:url('/img/icon01.gif') center no-repeat;}\n" +
|
||||||
|
"\t#BlogShare a.share_qq{float:left;width:32px;height:32px;margin-left: 10px;background:url('/img/icon02.gif') center no-repeat;}\n" +
|
||||||
|
"\t</style>\n" +
|
||||||
|
"\t<div class='BlogShare'>\n" +
|
||||||
|
"\t\n" +
|
||||||
|
"\t<span id='BlogShare'>\n" +
|
||||||
|
"\t\t<strong>分享到: </strong>\n" +
|
||||||
|
"\t\t<a class=\"share_sina\" title=\"分享到新浪微博\" href=\"javascript:void((function(s,d,e,r,l,p,t,z,c){var%20f='http://v.t.sina.com.cn/share/share.php?appkey=858381728',u=z||d.location,p=['&url=',e(u),'&title=',e(t||d.title),'&source=',e(r),'&sourceUrl=',e(l),'&content=',c||'gb2312','&pic=',e(p||'')].join('');function%20a(){if(!window.open([f,p].join(''),'mb',['toolbar=0,status=0,resizable=1,width=440,height=430,left=',(s.width-440)/2,',top=',(s.height-430)/2].join('')))u.href=[f,p].join('');};if(/Firefox/.test(navigator.userAgent))setTimeout(a,0);else%20a();})(screen,document,encodeURIComponent,'','','','Jsoup代码解读之八-防御XSS攻击: 防御XSS攻击的一般原理 cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。 我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,...','','utf-8'));\"></a>\n" +
|
||||||
|
"\t\t<a class=\"share_qq\" title=\"分享到腾讯微博\" href=\"javascript:(function(){window.open('http://v.t.qq.com/share/share.php?url='+encodeURIComponent(document.location)+'&appkey=96f54f97c4de46e393c4835a266207f4&site=&title='+encodeURIComponent(document.title)+encodeURIComponent(': 防御XSS攻击的一般原理 cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。 我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,...'),'', 'width=450, height=400, top=0, left=0, toolbar=no, menubar=no, scrollbars=no, location=yes, resizable=no, status=no');}())\"></a></span>\n" +
|
||||||
|
"\t<span id='BlogVote'>\n" +
|
||||||
|
" <a href=\"javascript:vote(158200)\">顶</a><span>已有 <em id='vote_count'>0</em>人顶</span>\n" +
|
||||||
|
"\t</span>\n" +
|
||||||
|
"\t</div>\n" +
|
||||||
|
"\t\t\n" +
|
||||||
|
"</div>\n" +
|
||||||
|
"<div class='SpaceList' style='margin-top:20px;'>\n" +
|
||||||
|
"<div class='BlogComments'>\n" +
|
||||||
|
" <h2><a name=\"comments\"></a>共有 3 条网友评论</h2>\n" +
|
||||||
|
"\t\t\t<ul id=\"BlogComments\">\n" +
|
||||||
|
"\t\t\t\t\t\t<li id='cmt_158200_180_275599137'>\n" +
|
||||||
|
"\t<table class='ostable'><tr>\n" +
|
||||||
|
"\t<td class='portrait'>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/rox\" target=\"_blank\"><img src=\"http://static.oschina.net/uploads/user/0/180_50.jpg?t=1367919013000\" align=\"absmiddle\" alt=\"静风流云\" title=\"静风流云\" class=\"SmallPortrait\" user=\"180\"/></a>\t\t\t\n" +
|
||||||
|
"\t</td>\n" +
|
||||||
|
"\t<td class='body'>\n" +
|
||||||
|
"\t\t<div class='title'>\n" +
|
||||||
|
"\t\t\t1楼:<a href=\"http://my.oschina.net/rox\" target=\"_blank\" name=\"rpl_275599137\">静风流云</a> 发表于 2013-09-01 08:34 \t\t\t\n" +
|
||||||
|
" \t \t <a href=\"javascript:delete_c(158200,180,275599137)\">删除</a>\n" +
|
||||||
|
"\t\t\t\t\t\t\t\t\t <a href=\"javascript:ReplyInline(158200,180,275599137)\">回复此评论</a>\n" +
|
||||||
|
"\t\t\t\t\t</div>\n" +
|
||||||
|
"\t\t<div class='post'\">貌似,OSC也是类似处理的。</div>\n" +
|
||||||
|
"\t\t<div id='inline_reply_of_158200_180_275599137' class='inline_reply'></div>\n" +
|
||||||
|
" </td>\n" +
|
||||||
|
"\t</tr></table>\n" +
|
||||||
|
"</li>\t\t\t\t\t<li id='cmt_158200_190591_275599170'>\n" +
|
||||||
|
"\t<table class='ostable'><tr>\n" +
|
||||||
|
"\t<td class='portrait'>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/flashsword\" target=\"_blank\"><img src=\"http://static.oschina.net/uploads/user/95/190591_50.jpg?t=1347254905000\" align=\"absmiddle\" alt=\"黄亿华\" title=\"黄亿华\" class=\"SmallPortrait\" user=\"190591\"/></a>\t\t\t\n" +
|
||||||
|
"\t</td>\n" +
|
||||||
|
"\t<td class='body'>\n" +
|
||||||
|
"\t\t<div class='title'>\n" +
|
||||||
|
"\t\t\t2楼:<a href=\"http://my.oschina.net/flashsword\" target=\"_blank\" name=\"rpl_275599170\">黄亿华</a> 发表于 2013-09-01 08:37 \t\t\t\n" +
|
||||||
|
" \t \t <a href=\"javascript:delete_c(158200,190591,275599170)\">删除</a>\n" +
|
||||||
|
"\t\t\t\t\t\t\t\t</div>\n" +
|
||||||
|
"\t\t<div class='post'\"><div class=ref><h4>引用来自“静风流云”的评论</h4><p>貌似,OSC也是类似处理的。</p></div>OSC就是使用Jsoup做解析的,见这里:<a href='http://www.oschina.net/p/jsoup' rel='nofollow' target='_blank'>http://www.oschina.net/p/jsoup</a></div>\n" +
|
||||||
|
"\t\t<div id='inline_reply_of_158200_190591_275599170' class='inline_reply'></div>\n" +
|
||||||
|
" </td>\n" +
|
||||||
|
"\t</tr></table>\n" +
|
||||||
|
"</li>\t\t\t\t\t<li id='cmt_158200_234880_275616963'>\n" +
|
||||||
|
"\t<table class='ostable'><tr>\n" +
|
||||||
|
"\t<td class='portrait'>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/searchjack\" target=\"_blank\"><img src=\"http://static.oschina.net/uploads/user/117/234880_50.jpg?t=1362718646000\" align=\"absmiddle\" alt=\"searchjack\" title=\"searchjack\" class=\"SmallPortrait\" user=\"234880\"/></a>\t\t\t\n" +
|
||||||
|
"\t</td>\n" +
|
||||||
|
"\t<td class='body'>\n" +
|
||||||
|
"\t\t<div class='title'>\n" +
|
||||||
|
"\t\t\t3楼:<a href=\"http://my.oschina.net/searchjack\" target=\"_blank\" name=\"rpl_275616963\">searchjack</a> 发表于 2013-09-02 09:20 \t\t\t\n" +
|
||||||
|
" \t \t <a href=\"javascript:delete_c(158200,234880,275616963)\">删除</a>\n" +
|
||||||
|
"\t\t\t\t\t\t\t\t\t <a href=\"javascript:ReplyInline(158200,234880,275616963)\">回复此评论</a>\n" +
|
||||||
|
"\t\t\t\t\t</div>\n" +
|
||||||
|
"\t\t<div class='post'\">极好的工具,</div>\n" +
|
||||||
|
"\t\t<div id='inline_reply_of_158200_234880_275616963' class='inline_reply'></div>\n" +
|
||||||
|
" </td>\n" +
|
||||||
|
"\t</tr></table>\n" +
|
||||||
|
"</li>\t\t\t\t</ul>\n" +
|
||||||
|
"</div>\n" +
|
||||||
|
"\t</div>\n" +
|
||||||
|
"\n" +
|
||||||
|
"<div id='inline_reply_editor' style='display:none;'>\n" +
|
||||||
|
"<div class=\"BlogCommentForm\">\n" +
|
||||||
|
"\t<form id=\"form_inline_comment\" action=\"/action/blog/add_comment?blog=158200\" method=\"POST\">\n" +
|
||||||
|
"\t <input type='hidden' id='inline_reply_id' name='reply_id' value=''/> \n" +
|
||||||
|
" <textarea name=\"content\" style=\"width:550px;height:60px;\" onkeydown=\"if((event.metaKey || event.ctrlKey)&&event.keyCode==13){$('#form_inline_comment').submit();}\"></textarea><br/>\n" +
|
||||||
|
"\t <input type=\"submit\" value=\"回复\" id=\"btn_comment\" class=\"SUBMIT\"/> \n" +
|
||||||
|
"\t <input type=\"button\" value=\"关闭\" class=\"SUBMIT\" id='btn_close_inline_reply'/> 文明上网,理性发言\n" +
|
||||||
|
" </form>\n" +
|
||||||
|
"</div>\n" +
|
||||||
|
"</div>\n" +
|
||||||
|
"<div class='SpaceList' style='margin-top:20px;'>\n" +
|
||||||
|
" <a name=\"comments\" id=\"postform\"></a>\n" +
|
||||||
|
" <div class=\"BlogCommentForm\">\n" +
|
||||||
|
" <form id=\"form_comment\" action=\"/action/blog/add_comment?blog=158200\" method=\"POST\"> \n" +
|
||||||
|
" <textarea id='ta_post_content' name=\"content\" style=\"width:550px;height:100px;\" onkeydown=\"if((event.metaKey || event.ctrlKey)&&event.keyCode==13){$('#form_comment').submit();}\"></textarea><br/>\n" +
|
||||||
|
"\t <input type=\"submit\" value=\"发表评论\" id=\"btn_comment\" class=\"SUBMIT\" /> \n" +
|
||||||
|
"\t <img id=\"submiting\" style=\"display:none\" src=\"/img/loading.gif\" align=\"absmiddle\"/>\n" +
|
||||||
|
"\t <span id='cmt_tip'>文明上网,理性发言</span>\n" +
|
||||||
|
" </form>\n" +
|
||||||
|
"\t<a href=\"#\" class=\"more\">回到页首</a> | <a href=\"#comments\" class=\"more\">回到评论列表</a>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
" </div>\n" +
|
||||||
|
"\t\n" +
|
||||||
|
"<div id=\"RelativeBlogs\">\n" +
|
||||||
|
"\t<strong><a id='btn_close'>关闭</a>相关文章阅读</strong>\n" +
|
||||||
|
"\t<ul>\n" +
|
||||||
|
"\t\t\t<li>\n" +
|
||||||
|
"\t\t<span class='date'>2012/04/04</span>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/soitravel/blog/52366\" title=\"oo原则\">oo原则</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t\t\t<li>\n" +
|
||||||
|
"\t\t<span class='date'>2012/09/03</span>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/wangfree/blog/76273\" title=\"XSS跨站脚本攻击\">XSS跨站脚本攻击</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t\t\t<li>\n" +
|
||||||
|
"\t\t<span class='date'>2012/10/10</span>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/samshuai/blog/82382\" title=\"《蟋蟀的xss淫荡教程之如何劫持OSC用户账号》\">《蟋蟀的xss淫荡教程之如何劫持OSC...</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t\t\t<li>\n" +
|
||||||
|
"\t\t<span class='date'>2013/06/08</span>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/tdoly/blog/136632\" title=\"[Security]XSS一直是个棘手的问题\">[Security]XSS一直是个棘手的问题...</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t\t\t<li>\n" +
|
||||||
|
"\t\t<span class='date'>2013/01/05</span>\n" +
|
||||||
|
"\t\t<a href=\"http://my.oschina.net/sharephper/blog/100107\" title=\"xss攻击\">xss攻击</a>\n" +
|
||||||
|
"\t</li>\n" +
|
||||||
|
"\t\t\t</ul>\n" +
|
||||||
|
"</div>\n" +
|
||||||
|
"<script type=\"text/javascript\" src=\"/action/visit/blog?id=158200\" defer=\"defer\"></script>\n" +
|
||||||
|
"<script type=\"text/javascript\" src=\"/js/syntax-highlighter-2.1.382/scripts/brush.js\"></script>\n" +
|
||||||
|
"<link type=\"text/css\" rel=\"stylesheet\" href=\"/js/syntax-highlighter-2.1.382/styles/shCore.css\"/>\n" +
|
||||||
|
"<link type=\"text/css\" rel=\"stylesheet\" href=\"/js/syntax-highlighter-2.1.382/styles/shThemeDefault.css\"/>\n" +
|
||||||
|
"<script type='text/javascript'><!--\n" +
|
||||||
|
"$(document).ready(function(){\n" +
|
||||||
|
"\tSyntaxHighlighter.config.clipboardSwf = '/js/syntax-highlighter-2.1.382/scripts/clipboard.swf';\n" +
|
||||||
|
"\tSyntaxHighlighter.all();\n" +
|
||||||
|
"});\n" +
|
||||||
|
"//-->\n" +
|
||||||
|
"</script>\n" +
|
||||||
|
"<!--[if lt IE 7]>\n" +
|
||||||
|
"<script type=\"text/javascript\" src=\"/js/minmax.js\"></script>\n" +
|
||||||
|
"<![endif]-->\n" +
|
||||||
|
"<script type='text/javascript'>\n" +
|
||||||
|
"<!--\n" +
|
||||||
|
"var posting = false;\n" +
|
||||||
|
"var upprev_closed = false;\n" +
|
||||||
|
"var upprev_hidden = true;\n" +
|
||||||
|
"\n" +
|
||||||
|
"$(document).ready(function(){\n" +
|
||||||
|
" $('.BlogContent img').css('cursor','pointer');\n" +
|
||||||
|
" jQuery.each($('.BlogContent img'),function(idx,v){\n" +
|
||||||
|
" \t$(v).wrap(\"<a href='\"+$(this).attr('src')+\"' target='_blank'></a>\");\n" +
|
||||||
|
" });\n" +
|
||||||
|
"\t$('#form_comment').ajaxForm({\n" +
|
||||||
|
"\t\tdataType: 'json',\n" +
|
||||||
|
"\t\tbforeSubmit: function(){\n" +
|
||||||
|
"\t\t\tposting = true;\n" +
|
||||||
|
"\t\t},\n" +
|
||||||
|
"\t\tsuccess: function(json) {\n" +
|
||||||
|
" \tif(json.msg){\n" +
|
||||||
|
"\t\t\t\t///alert(json.msg);\n" +
|
||||||
|
"\t\t\t\t$('#cmt_tip').html(\"<span style='color:#C00;'>\"+json.msg+\"</span>\");\n" +
|
||||||
|
"\t\t\t\t$('#ta_post_content').focus();\t\t\t\t\n" +
|
||||||
|
"\t\t\t}else{\n" +
|
||||||
|
"\t\t\t\tvar url = \"http://my.oschina.net/flashsword/blog_post?_cmt_blog=\"+json.blog+\"&_cmt_user=\"+json.user+\"&_cmt_id=\"+json.id;\t\t\t\t\n" +
|
||||||
|
" \t\tjQuery.get(url, function(data){\n" +
|
||||||
|
" \t\t\t\t$('.BlogComments .NoData').hide();\n" +
|
||||||
|
" \t\t\t$('ul#BlogComments').append(data);\n" +
|
||||||
|
" \t\t\t$('#form_comment').resetForm();\n" +
|
||||||
|
" \t\t}); \n" +
|
||||||
|
"\t\t\t}\n" +
|
||||||
|
"\t\t}\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\n" +
|
||||||
|
" var at_datas = [];\n" +
|
||||||
|
" $('img.SmallPortrait').each(function(){\n" +
|
||||||
|
" var name = $(this).attr('alt');\n" +
|
||||||
|
" if(jQuery.inArray(name, at_datas) < 0 && name != '黄亿华')\n" +
|
||||||
|
" at_datas.push(name);\n" +
|
||||||
|
" });\n" +
|
||||||
|
" $(\"#form_comment textarea\").atWho(\"@\", {data: at_datas});\n" +
|
||||||
|
"\n" +
|
||||||
|
"\t$(\"#submiting\").ajaxStart(function(){\n" +
|
||||||
|
"\t if(posting){\n" +
|
||||||
|
" \t $('#btn_submit').attr(\"disabled\",\"disabled\");\n" +
|
||||||
|
" $(this).show();\n" +
|
||||||
|
"\t }\n" +
|
||||||
|
" });\n" +
|
||||||
|
"\t$(\"#submiting\").ajaxComplete(function(event,request, settings){\n" +
|
||||||
|
"\t if(posting){\n" +
|
||||||
|
" $(this).hide();\n" +
|
||||||
|
" \t $('#btn_submit').attr(\"disabled\",\"\");\n" +
|
||||||
|
"\t }\n" +
|
||||||
|
"\t posting = false;\n" +
|
||||||
|
" }); \n" +
|
||||||
|
"\t\n" +
|
||||||
|
" $(window).scroll(function() {\n" +
|
||||||
|
" var lastScreen;\n" +
|
||||||
|
" if ($(\"#postform\").length > 0)\n" +
|
||||||
|
" lastScreen = getScrollY() + $(window).height() < $(\"#postform\").offset().top * 1 ? false : true;\n" +
|
||||||
|
" else\n" +
|
||||||
|
" lastScreen = getScrollY() + $(window).height() < $(document).height() * 1 ? false : true;\n" +
|
||||||
|
" if (lastScreen && !upprev_closed) {\n" +
|
||||||
|
" $(\"#RelativeBlogs\").stop().animate({right:\"0px\"});\n" +
|
||||||
|
" upprev_hidden = false;\n" +
|
||||||
|
" }\n" +
|
||||||
|
" else if (upprev_closed && getScrollY() == 0) {\n" +
|
||||||
|
" upprev_closed = false;\n" +
|
||||||
|
" }\n" +
|
||||||
|
" else if (!upprev_hidden) {\n" +
|
||||||
|
" upprev_hidden = true;\n" +
|
||||||
|
" $(\"#RelativeBlogs\").stop().animate({right:\"-400px\"});\n" +
|
||||||
|
" }\n" +
|
||||||
|
" });\n" +
|
||||||
|
" $(\"#RelativeBlogs #btn_close\").click(function() {\n" +
|
||||||
|
" $(\"#RelativeBlogs\").stop().animate({right:\"-400px\"});\n" +
|
||||||
|
" upprev_closed = true;\n" +
|
||||||
|
" upprev_hidden = true;\n" +
|
||||||
|
" });\n" +
|
||||||
|
"});\n" +
|
||||||
|
"function delete_c(nid,uid,cid){\n" +
|
||||||
|
" if(confirm(\"您确认要删除此篇评论?\")){\n" +
|
||||||
|
" var args = \"cmt=\"+cid+\"#\"+uid+\"#\"+nid;\n" +
|
||||||
|
" ajax_post(\"/action/blog/delete_blog_comments?space=190591\",args,function(){$(\"#cmt_\"+nid+\"_\"+uid+\"_\"+cid).fadeOut();});\n" +
|
||||||
|
" }\n" +
|
||||||
|
"}\n" +
|
||||||
|
"function ReplyInline(blog,user,reply){\n" +
|
||||||
|
"\t$('.inline_reply').empty();\n" +
|
||||||
|
"\tvar div_id = '#inline_reply_of_'+blog+'_'+user+'_'+reply;\n" +
|
||||||
|
"\t$('#inline_reply_id').val(user+'_'+reply);\n" +
|
||||||
|
"\t$(div_id).html($('#inline_reply_editor').html());\n" +
|
||||||
|
"\t$('#txt_focus').focus();\n" +
|
||||||
|
"\t$('#btn_close_inline_reply').click(function(){\n" +
|
||||||
|
"\t\t$(div_id).empty();\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\t$('#form_inline_comment').ajaxForm({\n" +
|
||||||
|
"\t\tdataType: 'json',\n" +
|
||||||
|
" \tsuccess: function(json) {\n" +
|
||||||
|
" \tif(json.msg){\n" +
|
||||||
|
" \t\talert(json.msg);\n" +
|
||||||
|
" \t}\n" +
|
||||||
|
" \telse if(json.id){\n" +
|
||||||
|
" \t\t\tlocation.reload();\n" +
|
||||||
|
" \t}\n" +
|
||||||
|
" \t}\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"}\n" +
|
||||||
|
"function edit_catalogs(qid){\n" +
|
||||||
|
"\tpopup(\"/set-catalogs?parent=1&type=3&id=\"+qid);\n" +
|
||||||
|
"}\n" +
|
||||||
|
"function vote(blogid){\n" +
|
||||||
|
"\t\tajax_post(\"/action/blog/vote\",\"id=\"+blogid+\"&user=190591\",function(result){\n" +
|
||||||
|
"\t\tvar json = eval('('+result+')');\n" +
|
||||||
|
"\t\tif(json.vote)\n" +
|
||||||
|
"\t\t\t$('#vote_count').html(json.vote);\n" +
|
||||||
|
"\t\telse if(json.error == 1)\n" +
|
||||||
|
"\t\t\talert(json.msg);\n" +
|
||||||
|
"\t\telse\n" +
|
||||||
|
"\t\t\talert(json.msg);\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\t}\n" +
|
||||||
|
"function toggle_recomm(blogid){\n" +
|
||||||
|
"\tajax_post(\"/action/blog/toggle_recomm\",\"id=\"+blogid,function(html){\n" +
|
||||||
|
"\t\tif(html == '-1')\n" +
|
||||||
|
"\t\t\talert(\"文章不存在\");\n" +
|
||||||
|
"\t\telse if(html == 0){\n" +
|
||||||
|
"\t\t\t$('#lnk_recomm_'+blogid).removeClass('recommend');\n" +
|
||||||
|
"\t\t\t$('#lnk_recomm_'+blogid).text(\"未推荐\");\n" +
|
||||||
|
"\t\t}\n" +
|
||||||
|
"\t\telse if(html == 1){\n" +
|
||||||
|
"\t\t\t$('#lnk_recomm_'+blogid).addClass('recommend');\n" +
|
||||||
|
"\t\t\t$('#lnk_recomm_'+blogid).text(\"已推荐\");\n" +
|
||||||
|
"\t\t}\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"}\n" +
|
||||||
|
"//-->\n" +
|
||||||
|
"</script></div>\n" +
|
||||||
|
"\t<div class='clear'></div>\n" +
|
||||||
|
"\t<div id=\"OSC_Footer\"><style>\n" +
|
||||||
|
".oscapp {text-align:left; width:220px;}\n" +
|
||||||
|
".oscapp span {float:left;width:140px;}\n" +
|
||||||
|
".oscapp a {float:left;text-indent:-9999em;width:16px;margin-left:8px;}\n" +
|
||||||
|
".oscapp a.android {background:url('/img/android.gif') no-repeat left center;}\n" +
|
||||||
|
".oscapp a.iphone {background:url('/img/iphone.gif') no-repeat left center;}\n" +
|
||||||
|
".oscapp a.wp7 {background:url('/img/wp7.gif') no-repeat left center;}\n" +
|
||||||
|
"</style>\n" +
|
||||||
|
"<table width='100%'><tr>\n" +
|
||||||
|
"<td align='left'>© 开源中国(OsChina.NET) | <a href=\"http://www.oschina.net/home/about\">关于我们</a> | <a href=\"mailto:oschina.net@gmail.com\">广告联系</a> | <a href=\"http://weibo.com/oschina2010\" target=\"_blank\">@新浪微博</a> | <a href=\"http://m.oschina.net/\">开源中国手机版</a> | <a href='http://www.miitbeian.gov.cn/' target='_blank' style='color:#737573;text-decoration:none;'>粤ICP备12009483号-3</a></td>\n" +
|
||||||
|
"<td class='oscapp'>\n" +
|
||||||
|
"\t<span>开源中国手机客户端:</span>\n" +
|
||||||
|
"\t<a href=\"http://www.oschina.net/app\" class='android' title='Android客户端'>Android</a>\n" +
|
||||||
|
"\t<a href=\"http://www.oschina.net/app\" class='iphone' title='iPhone 客户端'>iPhone</a>\n" +
|
||||||
|
"\t<a href=\"http://www.oschina.net/app\" class='wp7' title='Windows Phone 客户端'>WP7</a>\n" +
|
||||||
|
"</td>\n" +
|
||||||
|
"</tr>\n" +
|
||||||
|
"</table>\n" +
|
||||||
|
"<script type='text/javascript'>\n" +
|
||||||
|
"<!--\n" +
|
||||||
|
"if (top.location != self.location)top.location=self.location;\n" +
|
||||||
|
"//-->\n" +
|
||||||
|
"</script></div>\n" +
|
||||||
|
"</div>\n" +
|
||||||
|
"</body>\n" +
|
||||||
|
"\n" +
|
||||||
|
"<script type=\"text/javascript\" src=\"/action/visit/space?id=190591\"></script>\n" +
|
||||||
|
"<script type='text/javascript'>\n" +
|
||||||
|
"<!--\n" +
|
||||||
|
"$(document).ready(function() {\n" +
|
||||||
|
"\n" +
|
||||||
|
"\tSelectStyle(\"#search-item\",\".search_list\");\n" +
|
||||||
|
"\t$('.Tweet .photo img').live(\"click\",function(){\n" +
|
||||||
|
"\t\tvar T=$(this);\n" +
|
||||||
|
"\t\tvar t=this;\n" +
|
||||||
|
"\t\tvar bigImg = T.attr('bi');\n" +
|
||||||
|
"\t\tvar smallImg = T.attr('si');\n" +
|
||||||
|
"\t\tvar src = T.attr('src');\n" +
|
||||||
|
"\t\tvar newsrc = (bigImg == src)?smallImg:bigImg;\n" +
|
||||||
|
"\t\tvar imgId = T.attr('id');\n" +
|
||||||
|
"\t\tif(newsrc == bigImg){\n" +
|
||||||
|
" \t\tvar loading=$('<img alt=\"loading\" src=\"/img/loading.gif\"/>');\n" +
|
||||||
|
"\t\t\tvar top = T.position().top+T.height()/2-8;\n" +
|
||||||
|
"\t\t\tvar left = T.position().left+T.width()/2-8;\n" +
|
||||||
|
"\t\t\tloading.css({\n" +
|
||||||
|
"\t\t\t\t'position':'absolute',\n" +
|
||||||
|
"\t\t\t\t'z-index':999,\n" +
|
||||||
|
"\t\t\t\t'top':top,\n" +
|
||||||
|
"\t\t\t\t'left':left\n" +
|
||||||
|
"\t\t\t});\n" +
|
||||||
|
" \t\tT.before(loading);\n" +
|
||||||
|
"\t\t\tvar tImg=new Image();\n" +
|
||||||
|
"\t\t\ttImg.src=newsrc;\n" +
|
||||||
|
"\t\t\ttImg.onload=function(){afterImgLoad(T,loading,imgId,newsrc,bigImg);};\n" +
|
||||||
|
"\t\t}\n" +
|
||||||
|
"\t\telse{\n" +
|
||||||
|
"\t\t\tT.attr(\"src\",newsrc);\n" +
|
||||||
|
"\t\t\t$('#img_menu_'+imgId).remove();\n" +
|
||||||
|
"\t\t}\n" +
|
||||||
|
"\t\treturn false;\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\t\n" +
|
||||||
|
"\t$(\".tweet_thumb_wrapper\").mouseenter(function(){\n" +
|
||||||
|
"\t\t$(this).find(\".tweet_play_video\").css(\"opacity\",1);\n" +
|
||||||
|
"\t}).mouseleave(function(){\n" +
|
||||||
|
"\t\t$(this).find(\".tweet_play_video\").css(\"opacity\",0.7);\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\n" +
|
||||||
|
" $(\"#TForm textarea\").atWho(\"@\", function(query, callback){\n" +
|
||||||
|
" jQuery.ajax({\n" +
|
||||||
|
" type:'POST',\n" +
|
||||||
|
" url:\"/action/tweet/at_suggest\",\n" +
|
||||||
|
" data:{'q':query},\n" +
|
||||||
|
" dataType:'json',\n" +
|
||||||
|
" success:function(json){\n" +
|
||||||
|
" callback(json);\n" +
|
||||||
|
" }\n" +
|
||||||
|
" });\n" +
|
||||||
|
" });\n" +
|
||||||
|
"\t\n" +
|
||||||
|
"\ttoggle_tweet_video = function(id){\n" +
|
||||||
|
"\t\t$(\"#tweet_video_thumb_\"+id).toggle();\n" +
|
||||||
|
"\t\tvar video = $(\"#tweet_video_\"+id).toggle();\n" +
|
||||||
|
"\t\tvideo.siblings(\".tweet_video_operation,.tweet_thumb_wrapper\").toggle();\n" +
|
||||||
|
"\t};\n" +
|
||||||
|
"\t\n" +
|
||||||
|
"\tfunction afterImgLoad(T,loading,imgId,url,bigImg){\n" +
|
||||||
|
"\t\tvar lnks = \"<div id='img_menu_\"+imgId+\"' class='ImgMenu'>\";\n" +
|
||||||
|
"\t\tlnks += \"<a href='#' onclick='$(\\\"#\"+imgId+\"\\\").click();return false;'>收起</a>\";\n" +
|
||||||
|
"\t\tlnks += \"<a href='\"+bigImg+\"' target='_blank'>查看原图</a></div>\";\t\t\t\n" +
|
||||||
|
"\t\tloading.remove();\n" +
|
||||||
|
"\t\tT.attr(\"src\",url);\n" +
|
||||||
|
"\t\tT.before(lnks);\n" +
|
||||||
|
"\t}\n" +
|
||||||
|
"});\n" +
|
||||||
|
"\n" +
|
||||||
|
"function set_fellow_memo(fid,fname){\n" +
|
||||||
|
"\tpopup(\"/action/ajax/set_fellow_memo\",\"friend=\"+fid+\"&name=\"+fname);\n" +
|
||||||
|
"}\n" +
|
||||||
|
"\n" +
|
||||||
|
"function deleteMsgs(uid, fid, fname){\n" +
|
||||||
|
"\tif(!confirm(\"你确认要清除与‘\"+fname+\"’的所有留言信息吗?\"))\n" +
|
||||||
|
"\t\treturn ;\n" +
|
||||||
|
"\tvar args = \"user=\"+uid+\"&friend=\"+fid;\n" +
|
||||||
|
"\tajax_post(\"/action/msg/delete_user\",args,function(html){\n" +
|
||||||
|
"\t\tif(html.length > 0)\n" +
|
||||||
|
"\t\t\talert(html);\n" +
|
||||||
|
"\t\telse{\n" +
|
||||||
|
"\t\t\t$('#Msg_'+fid).fadeOut();\n" +
|
||||||
|
"\t\t}\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"}\n" +
|
||||||
|
"\n" +
|
||||||
|
"function follow_user(uid, uname){\n" +
|
||||||
|
"\tjust_follow(uid, uname,'190591'); //oschina.js\n" +
|
||||||
|
"}\n" +
|
||||||
|
"\n" +
|
||||||
|
"function unfollow_user(uid, uname){\n" +
|
||||||
|
"\tif(confirm(\"确定不再关注\" + uname + \"了吗?\"))\n" +
|
||||||
|
"\tjust_unfollow(uid,'190591',function(){\n" +
|
||||||
|
"\t\talert('已取消对 ' + uname + ' 的关注');\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"}\n" +
|
||||||
|
"\n" +
|
||||||
|
"function tweet_reply(logid){\n" +
|
||||||
|
"\tvar r = $('#LogReply_'+logid);\n" +
|
||||||
|
"\tif(!r.is(\":hidden\")){\n" +
|
||||||
|
"\t\tclose_tweet_reply(logid);\n" +
|
||||||
|
"\t\treturn ;\n" +
|
||||||
|
"\t}\n" +
|
||||||
|
"\tr.html(\"<div class='TweetRplsWrapper'><span class='loading'>正在加载评论,请稍候...</span></div>\")\n" +
|
||||||
|
"\tr.show();\n" +
|
||||||
|
"\tr.load(\"http://my.oschina.net/flashsword/tweet-rpls?log=\"+logid,function(){\n" +
|
||||||
|
"\t\t$('#edt_tweet_post_'+logid).focus();\n" +
|
||||||
|
" var at_datas = [];\n" +
|
||||||
|
" $(this).find(\"img.SmallPortrait\").each(function(){\n" +
|
||||||
|
" var name = $(this).attr('alt');\n" +
|
||||||
|
" if(jQuery.inArray(name, at_datas) < 0 && name != '黄亿华')\n" +
|
||||||
|
" at_datas.push(name);\n" +
|
||||||
|
" });\n" +
|
||||||
|
" $(this).find(\"input.TXT_TweetRpl_Text\").atWho(\"@\", {data: at_datas});\n" +
|
||||||
|
" $('#TweetReplyForm_'+logid).ajaxForm({\n" +
|
||||||
|
" \tdataType: 'json',\n" +
|
||||||
|
"\t\t\tbeforeSubmit: function(arr, form, options){\n" +
|
||||||
|
"\t\t\t\t$('#BTN_TweetReply_'+logid).attr('disabled','disabled');\n" +
|
||||||
|
"\t\t\t},\n" +
|
||||||
|
" success: function(json) {\n" +
|
||||||
|
" \tif(json.msg){\n" +
|
||||||
|
" \t\t\talert(json.msg);\n" +
|
||||||
|
" \t}else if(json.log){\n" +
|
||||||
|
"\t\t\t\t\t$('#log_reply_count_'+logid).text(json.reply_count);\n" +
|
||||||
|
" \t\t\t//插入新评论\t\t\t\t\t\n" +
|
||||||
|
"\t\t\t\t\tajax_get(\"/action/ajax/get_tweet_reply?id=\" + json.log,true,function(html){\n" +
|
||||||
|
"\t\t\t\t\t\t$('#LogReply_'+logid+' ul').prepend(html);\n" +
|
||||||
|
"\t\t\t\t\t});\n" +
|
||||||
|
"\t\t\t\t\t$('#edt_tweet_post_'+logid).val('');\n" +
|
||||||
|
" \t}\n" +
|
||||||
|
"\t\t\t\t$('#BTN_TweetReply_'+logid).removeAttr('disabled');\n" +
|
||||||
|
" }\n" +
|
||||||
|
" });\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"}\n" +
|
||||||
|
"function close_tweet_reply(logid){\n" +
|
||||||
|
"\t$('#LogReply_'+logid).empty();\n" +
|
||||||
|
"\t$('#LogReply_'+logid).hide();\n" +
|
||||||
|
"\t$('#Logs .userlogs li').removeClass('hover');\n" +
|
||||||
|
"}\n" +
|
||||||
|
"function reply_rtweet(logid, rid, toname){\n" +
|
||||||
|
"\tvar edtPost = $('#edt_tweet_post_' + logid);\n" +
|
||||||
|
"\tvar old_v = edtPost.val();\n" +
|
||||||
|
"\tif(old_v.length > 0)\n" +
|
||||||
|
"\t\tedtPost.val(old_v + ',@'+toname+' ');\n" +
|
||||||
|
"\telse\n" +
|
||||||
|
"\t\tedtPost.val('回复 @'+toname+' : ');\n" +
|
||||||
|
"\tedtPost.focus();\n" +
|
||||||
|
"\tedtPost.caretPos(edtPost.val().length); }\n" +
|
||||||
|
"function delete_tweet(logid){\n" +
|
||||||
|
"\tif(confirm(\"确认要删除这条信息吗?\"))\n" +
|
||||||
|
"\tajax_post(\"/action/tweet/delete?log=\"+logid+\"&user=190591\",\"\",function(html){\n" +
|
||||||
|
"\t\tif(html.length==0){\n" +
|
||||||
|
"\t\t\tvar elem = $('#LI_'+logid);\n" +
|
||||||
|
"\t\t\tif(elem.length > 0)\n" +
|
||||||
|
"\t\t\t\t$('#LI_'+logid).fadeOut();\n" +
|
||||||
|
"\t\t\telse\n" +
|
||||||
|
"\t\t\t\tlocation.reload();\n" +
|
||||||
|
"\t\t}\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"}\n" +
|
||||||
|
"function delete_tweet_reply(logid){\n" +
|
||||||
|
"\tif(confirm(\"确认要删除这条评论吗?\"))\n" +
|
||||||
|
"\tajax_post(\"/action/tweet/delete_reply?id=\"+logid+\"&user=190591\",\"\",function(html){\n" +
|
||||||
|
"\t\tif(html.length==0)\n" +
|
||||||
|
"\t\t\t$('#TweetReply_'+logid).fadeOut();\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"}\n" +
|
||||||
|
"\n" +
|
||||||
|
"function delete_blog(blog_id){\n" +
|
||||||
|
" if(!confirm(\"文章删除后无法恢复,请确认是否删除此篇文章?\")) return;\n" +
|
||||||
|
" ajax_post(\"/action/blog/delete?id=\"+blog_id+\"&user=190591&user_code=tzm9Wg2YoU8SkJaTIjHQkahStiXQNyymUGXFOQgN\",\"\",function(html){\n" +
|
||||||
|
" \tlocation.href=\"http://my.oschina.net/flashsword/blog\";\n" +
|
||||||
|
" });\n" +
|
||||||
|
"}\n" +
|
||||||
|
"\n" +
|
||||||
|
"function SelectStyle(on,option){\n" +
|
||||||
|
"\tvar currentSort = $(on).attr('id');\n" +
|
||||||
|
"\tvar currentText = $(option+\" li.\"+currentSort+\" a\").html();\n" +
|
||||||
|
"\t$(on + \" .text\").html(currentText);\n" +
|
||||||
|
"\t$(on + \" .text\").hover(function(){\n" +
|
||||||
|
"\t\t$(this).addClass(\"hover\")\n" +
|
||||||
|
"\t},function(){\n" +
|
||||||
|
"\t\t$(this).removeClass(\"hover\")\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\t$(option+\" li a\").each(function(index){\n" +
|
||||||
|
"\t\t$(this).click(function(){\n" +
|
||||||
|
"\t\t\tthishtml = $(this).html();\n" +
|
||||||
|
"\t\t\t$(on + \" .text\").removeClass(\"on\").html(thishtml);\t\t\n" +
|
||||||
|
"\t\t\t$(\".selectbox select \").find(\"option\").removeAttr('selected').eq(index).attr(\"selected\",\"selected\");\t\n" +
|
||||||
|
"\t\t\t$(option).hide()\n" +
|
||||||
|
"\t\t\treturn false;\n" +
|
||||||
|
"\t\t});\n" +
|
||||||
|
"\t\t\n" +
|
||||||
|
"\t});\t\t\n" +
|
||||||
|
"\t\n" +
|
||||||
|
"\t$(\".selectbox\").click(function(){\t\t\n" +
|
||||||
|
"\t\t$(option).toggle();\n" +
|
||||||
|
"\t\t$(on + \" .text\").toggleClass(\"on\");\t\t\n" +
|
||||||
|
"\t\treturn false;\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\t$(document).click(function(){\n" +
|
||||||
|
"\t\t$(option).hide();\t\n" +
|
||||||
|
"\t\t$(on + \" .text\").removeClass(\"on\");\n" +
|
||||||
|
"\t});\n" +
|
||||||
|
"\t$(document).trigger('click');\n" +
|
||||||
|
"\n" +
|
||||||
|
"}\n" +
|
||||||
|
"\n" +
|
||||||
|
"//-->\n" +
|
||||||
|
"</script>\n" +
|
||||||
|
"</html>\n" +
|
||||||
|
"\n" +
|
||||||
|
"<!-- Generated by OsChina.NET (init:0[ms],page:83[ms],ip:58.241.37.50) -->";
|
||||||
|
}
|
|
@ -0,0 +1,24 @@
|
||||||
|
package us.codecraft.webmagic.samples.scheduler;
|
||||||
|
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
/**
 * Manual smoke check for DelayQueueScheduler: pushes a single request and then polls the
 * scheduler in an endless loop, printing the current time and the polled request on every
 * iteration. Because the loop never terminates, the test is annotated with
 * {@code @Ignore("infinite")} and is only useful when run by hand.
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class DelayQueueSchedulerTest {
|
||||||
|
|
||||||
|
// Disabled in normal runs: the body below loops forever.
@Ignore("infinite")
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
// NOTE(review): (1, TimeUnit.SECONDS) presumably is the delay applied to queued requests - confirm against DelayQueueScheduler.
DelayQueueScheduler delayQueueScheduler = new DelayQueueScheduler(1, TimeUnit.SECONDS);
|
||||||
|
// Seed the scheduler with one request; null is passed as the second argument (presumably the task, unused here - confirm).
delayQueueScheduler.push(new Request("1"), null);
|
||||||
|
// Poll forever so the timing of successive polls can be observed on stdout.
while (true){
|
||||||
|
Request poll = delayQueueScheduler.poll(null);
|
||||||
|
// Output format: <epoch millis> TAB <polled request>.
System.out.println(System.currentTimeMillis()+"\t"+poll);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,6 +17,11 @@
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>xsoup</artifactId>
|
||||||
|
<version>0.0.1-SNAPSHOT</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
<artifactId>Saxon-HE</artifactId>
|
<artifactId>Saxon-HE</artifactId>
|
||||||
|
|
|
@ -1,8 +1,15 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.htmlcleaner.HtmlCleaner;
|
||||||
|
import org.htmlcleaner.TagNode;
|
||||||
|
import org.htmlcleaner.XPatherException;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import us.codecraft.xsoup.XPathEvaluator;
|
||||||
|
import us.codecraft.xsoup.Xsoup;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
|
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
|
||||||
|
@ -1353,6 +1360,7 @@ public class XpathSelectorTest {
|
||||||
Html html1 = new Html(html);
|
Html html1 = new Html(html);
|
||||||
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
|
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
|
||||||
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
|
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
|
||||||
|
Selectors.xpath("/abc/").select("");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -1379,17 +1387,86 @@ public class XpathSelectorTest {
|
||||||
xpath2Selector.selectList(html);
|
xpath2Selector.selectList(html);
|
||||||
}
|
}
|
||||||
System.out.println(System.currentTimeMillis()-time);
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
XpathSelector xpathSelector = new XpathSelector("//a");
|
XpathSelector xpathSelector = new XpathSelector("//a");
|
||||||
time =System.currentTimeMillis();
|
time =System.currentTimeMillis();
|
||||||
for (int i = 0; i < 1000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
xpathSelector.selectList(html);
|
xpathSelector.selectList(html);
|
||||||
}
|
}
|
||||||
System.out.println(System.currentTimeMillis()-time);
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
time =System.currentTimeMillis();
|
time =System.currentTimeMillis();
|
||||||
for (int i = 0; i < 1000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
xpath2Selector.selectList(html);
|
xpath2Selector.selectList(html);
|
||||||
}
|
}
|
||||||
|
System.out.println(System.currentTimeMillis() - time);
|
||||||
|
|
||||||
|
CssSelector cssSelector = new CssSelector("a");
|
||||||
|
time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
cssSelector.selectList(html);
|
||||||
|
}
|
||||||
|
System.out.println("css "+(System.currentTimeMillis()-time));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Ignore("take long time")
|
||||||
|
@Test
|
||||||
|
public void parserPerformanceTest() throws XPatherException {
|
||||||
|
System.out.println(html.length());
|
||||||
|
|
||||||
|
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||||
|
TagNode tagNode = htmlCleaner.clean(html);
|
||||||
|
Document document = Jsoup.parse(html);
|
||||||
|
|
||||||
|
long time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 2000; i++) {
|
||||||
|
htmlCleaner.clean(html);
|
||||||
|
}
|
||||||
System.out.println(System.currentTimeMillis()-time);
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
|
time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 2000; i++) {
|
||||||
|
tagNode.evaluateXPath("//a");
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
|
System.out.println("=============");
|
||||||
|
|
||||||
|
time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 2000; i++) {
|
||||||
|
Jsoup.parse(html);
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
|
time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 2000; i++) {
|
||||||
|
document.select("a");
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
|
System.out.println("=============");
|
||||||
|
|
||||||
|
time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 2000; i++) {
|
||||||
|
htmlCleaner.clean(html);
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
|
time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 2000; i++) {
|
||||||
|
tagNode.evaluateXPath("//a");
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
|
System.out.println("=============");
|
||||||
|
|
||||||
|
XPathEvaluator compile = Xsoup.compile("//a");
|
||||||
|
time =System.currentTimeMillis();
|
||||||
|
for (int i = 0; i < 2000; i++) {
|
||||||
|
compile.evaluate(document);
|
||||||
|
}
|
||||||
|
System.out.println(System.currentTimeMillis()-time);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,23 +29,18 @@ Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitca
|
||||||
|
|
||||||
### 使用maven
|
### 使用maven
|
||||||
|
|
||||||
webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译:
|
webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic:
|
||||||
|
|
||||||
git clone https://github.com/code4craft/webmagic.git
|
|
||||||
cd webmagic
|
|
||||||
mvn clean install
|
|
||||||
|
|
||||||
安装后,在项目中添加对应的依赖即可使用webmagic:
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.2.0</version>
|
<version>0.3.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.2.0</version>
|
<version>0.3.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
#### 项目结构
|
#### 项目结构
|
||||||
|
@ -60,7 +55,7 @@ webmagic主要包括两个包:
|
||||||
|
|
||||||
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
|
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
|
||||||
|
|
||||||
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来:
|
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译:
|
||||||
|
|
||||||
* **webmagic-saxon**
|
* **webmagic-saxon**
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue