Merge branch 'master' of github.com:code4craft/webmagic
Conflicts: README.md webmagic-samples/pom.xml webmagic-selenium/pom.xml
master
commit
7c41bec92f
|
@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.4.1</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.4.1</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
|
||||
#### 项目结构
|
||||
|
|
|
@ -28,12 +28,12 @@ Add dependencies to your project:
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
|
||||
## Get Started:
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -6,7 +6,7 @@
|
|||
<version>7</version>
|
||||
</parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.2-SNAPSHOT</version>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
|
|
|
@ -27,12 +27,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
|
||||
#### 项目结构
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.4.2-SNAPSHOT</version>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -70,6 +70,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
CloseableHttpClient httpClient = httpClients.get(domain);
|
||||
if (httpClient == null) {
|
||||
synchronized (this) {
|
||||
httpClient = httpClients.get(domain);
|
||||
if (httpClient == null) {
|
||||
httpClient = httpClientGenerator.getClient(site);
|
||||
httpClients.put(domain, httpClient);
|
||||
|
@ -104,6 +105,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
}
|
||||
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
|
||||
.setConnectionRequestTimeout(site.getTimeOut())
|
||||
.setSocketTimeout(site.getTimeOut())
|
||||
.setConnectTimeout(site.getTimeOut())
|
||||
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
||||
if (site != null && site.getHttpProxy() != null) {
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.4.2-SNAPSHOT</version>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -23,14 +23,18 @@ public class AppStore {
|
|||
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..userRatingCount")
|
||||
private int userRatingCount;
|
||||
|
||||
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls",multi = true)
|
||||
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls")
|
||||
private List<String> screenshotUrls;
|
||||
|
||||
@ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..supportedDevices")
|
||||
private List<String> supportedDevices;
|
||||
|
||||
public static void main(String[] args) {
|
||||
AppStore appStore = OOSpider.create(Site.me(), AppStore.class).<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");
|
||||
System.out.println(appStore.trackName);
|
||||
System.out.println(appStore.description);
|
||||
System.out.println(appStore.userRatingCount);
|
||||
System.out.println(appStore.screenshotUrls);
|
||||
System.out.println(appStore.supportedDevices);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -131,7 +131,9 @@ class PageModelExtractor {
|
|||
if (regexPattern.trim().equals("")) {
|
||||
regexPattern = ".*";
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
|
||||
fieldExtractor = new FieldExtractor(field,
|
||||
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
|
||||
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -157,7 +159,7 @@ class PageModelExtractor {
|
|||
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
|
||||
}
|
||||
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
|
||||
comboExtract.notNull(), comboExtract.multi());
|
||||
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -172,7 +174,7 @@ class PageModelExtractor {
|
|||
if (extractBy != null) {
|
||||
Selector selector = ExtractorUtils.getSelector(extractBy);
|
||||
fieldExtractor = new FieldExtractor(field, selector, extractBy.source() == ExtractBy.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
|
||||
extractBy.notNull(), extractBy.multi());
|
||||
extractBy.notNull(), extractBy.multi() || List.class.isAssignableFrom(field.getType()));
|
||||
Method setterMethod = getSetterMethod(clazz, field);
|
||||
if (setterMethod != null) {
|
||||
fieldExtractor.setSetterMethod(setterMethod);
|
||||
|
@ -359,7 +361,7 @@ class PageModelExtractor {
|
|||
}
|
||||
|
||||
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
|
||||
if (value==null){
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
if (fieldExtractor.getSetterMethod() != null) {
|
||||
|
|
|
@ -75,6 +75,8 @@ public @interface ComboExtract {
|
|||
* Define whether the extractor return more than one result.
|
||||
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
|
||||
*
|
||||
* Deprecated since 0.4.2. This option is determined automatically by the class of field.
|
||||
* @deprecated since 0.4.2
|
||||
* @return whether the extractor return more than one result
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
|
|
@ -67,6 +67,8 @@ public @interface ExtractBy {
|
|||
* Define whether the extractor return more than one result.
|
||||
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
|
||||
*
|
||||
* Deprecated since 0.4.2. This option is determined automatically by the class of field.
|
||||
* @deprecated since 0.4.2
|
||||
* @return whether the extractor return more than one result
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
|
|
@ -33,6 +33,8 @@ public @interface ExtractByUrl {
|
|||
* Define whether the extractor return more than one result.
|
||||
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
|
||||
*
|
||||
* Deprecated since 0.4.2. This option is determined automatically by the class of field.
|
||||
* @deprecated since 0.4.2
|
||||
* @return whether the extractor return more than one result
|
||||
*/
|
||||
boolean multi() default false;
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.1</version>
|
||||
<version>0.4.2</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.2-SNAPSHOT</version>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -31,6 +31,11 @@
|
|||
<artifactId>webmagic-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
@ -92,7 +92,8 @@ public class ScriptConsole {
|
|||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
|
||||
.language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
|
||||
pageProcessor.getSite().setSleepTime(params.getSleepTime());
|
||||
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404, 500));
|
||||
pageProcessor.getSite().setRetryTimes(3);
|
||||
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404,403, 500,502));
|
||||
Spider spider = Spider.create(pageProcessor).thread(params.getThread());
|
||||
spider.clearPipeline().addPipeline(new Pipeline() {
|
||||
@Override
|
||||
|
|
|
@ -34,6 +34,7 @@ public class ScriptEnginePool {
|
|||
|
||||
public void release(ScriptEngine scriptEngine){
|
||||
scriptEngines.add(scriptEngine);
|
||||
availableCount.incrementAndGet();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.1</version>
|
||||
<version>0.4.2</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -34,12 +34,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
|||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.4.0</version>
|
||||
<version>0.4.2</version>
|
||||
</dependency>
|
||||
|
||||
#### 项目结构
|
||||
|
|
Loading…
Reference in New Issue