update version to 0.5.0-snapshot
parent
6933029ea5
commit
af809c4d55
12
pom.xml
12
pom.xml
|
@ -6,7 +6,7 @@
|
|||
<version>7</version>
|
||||
</parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<packaging>pom</packaging>
|
||||
<properties>
|
||||
|
@ -76,6 +76,16 @@
|
|||
<artifactId>guava</artifactId>
|
||||
<version>15.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>1.7.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<version>1.7.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>xsoup</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -23,7 +23,6 @@
|
|||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>15.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -37,8 +36,13 @@
|
|||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
|
@ -2,7 +2,8 @@ package us.codecraft.webmagic;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
import us.codecraft.webmagic.downloader.HttpClientDownloader;
|
||||
import us.codecraft.webmagic.pipeline.CollectorPipeline;
|
||||
|
@ -18,7 +19,10 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
|||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
@ -72,7 +76,7 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
protected Scheduler scheduler = new QueueScheduler();
|
||||
|
||||
protected Logger logger = Logger.getLogger(getClass());
|
||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
protected ExecutorService executorService;
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
package us.codecraft.webmagic.configurable;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com
|
||||
*/
|
||||
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
|
||||
@Target({ElementType.FIELD})
|
||||
public @interface Inject {
|
||||
|
||||
String value() default "";
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package us.codecraft.webmagic.example;
|
||||
|
||||
import java.util.List;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.configurable.Inject;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
public class ConfigurableBlogPageProcesser implements PageProcessor {
|
||||
|
||||
private Site site = Site.me().setDomain("my.oschina.net");
|
||||
|
||||
@Inject("linkRegex")
|
||||
private String linkRegex;
|
||||
|
||||
@Inject("titleXpath")
|
||||
private String titleXpath;
|
||||
|
||||
@Inject("contentXpath")
|
||||
private String contentXpath;
|
||||
|
||||
@Inject("tagsXpath")
|
||||
private String tagsXpath;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> links = page.getHtml().links().regex(linkRegex).all();
|
||||
page.addTargetRequests(links);
|
||||
page.putField("title", page.getHtml().xpath(titleXpath).toString());
|
||||
if (page.getResultItems().get("title") == null) {
|
||||
//skip this page
|
||||
page.setSkip(true);
|
||||
}
|
||||
page.putField("content", page.getHtml().smartContent().toString());
|
||||
page.putField("tags", page.getHtml().xpath(tagsXpath).all());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new ConfigurableBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
|
||||
}
|
||||
}
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -5,19 +5,18 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-panel</artifactId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-scripts</artifactId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -5,20 +5,19 @@
|
|||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>0.5.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-worker</artifactId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<packaging>war</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-scripts</artifactId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
Loading…
Reference in New Issue