update to 0.3.1
commit
a2fba8caa2
|
@ -1,3 +1,4 @@
|
||||||
target/*
|
target/*
|
||||||
*.iml
|
*.iml
|
||||||
out/
|
out/
|
||||||
|
.idea
|
||||||
|
|
10
pom.xml
10
pom.xml
|
@ -6,7 +6,7 @@
|
||||||
<version>7</version>
|
<version>7</version>
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.3.0</version>
|
<version>0.3.1</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -109,6 +109,14 @@
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-surefire-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<forkMode>pertest</forkMode>
|
||||||
|
<argLine>-Xms1024m -Xmx1024m -Xss1m </argLine>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-compiler-plugin</artifactId>
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
|
|
@ -1,11 +0,0 @@
|
||||||
#release configuration
|
|
||||||
#Tue Aug 20 23:36:56 CST 2013
|
|
||||||
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
|
||||||
pushChanges=true
|
|
||||||
scm.url=scm\:git\:git@github.com\:code4craft/webmagic.git
|
|
||||||
preparationGoals=clean verify
|
|
||||||
remoteTagging=true
|
|
||||||
scm.commentPrefix=[maven-release-plugin]
|
|
||||||
exec.additionalArguments=-Psonatype-oss-release -P development
|
|
||||||
exec.snapshotReleasePluginAllowed=false
|
|
||||||
completedPhase=check-poms
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.3.0</version>
|
<version>0.3.1</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -79,22 +79,22 @@ public class Spider implements Runnable, Task {
|
||||||
* create a spider with pageProcessor.
|
* create a spider with pageProcessor.
|
||||||
*
|
*
|
||||||
* @param pageProcessor
|
* @param pageProcessor
|
||||||
|
* @return new spider
|
||||||
|
* @see PageProcessor
|
||||||
*/
|
*/
|
||||||
public Spider(PageProcessor pageProcessor) {
|
public static Spider create(PageProcessor pageProcessor) {
|
||||||
this.pageProcessor = pageProcessor;
|
return new Spider(pageProcessor);
|
||||||
this.site = pageProcessor.getSite();
|
|
||||||
this.startUrls = pageProcessor.getSite().getStartUrls();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a spider with pageProcessor.
|
* create a spider with pageProcessor.
|
||||||
*
|
*
|
||||||
* @param pageProcessor
|
* @param pageProcessor
|
||||||
* @return new spider
|
|
||||||
* @see PageProcessor
|
|
||||||
*/
|
*/
|
||||||
public static Spider create(PageProcessor pageProcessor) {
|
public Spider(PageProcessor pageProcessor) {
|
||||||
return new Spider(pageProcessor);
|
this.pageProcessor = pageProcessor;
|
||||||
|
this.site = pageProcessor.getSite();
|
||||||
|
this.startUrls = pageProcessor.getSite().getStartUrls();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -105,7 +105,7 @@ public class Spider implements Runnable, Task {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider startUrls(List<String> startUrls) {
|
public Spider startUrls(List<String> startUrls) {
|
||||||
checkIfNotRunning();
|
checkIfRunning();
|
||||||
this.startUrls = startUrls;
|
this.startUrls = startUrls;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -139,11 +139,11 @@ public class Spider implements Runnable, Task {
|
||||||
*
|
*
|
||||||
* @param scheduler
|
* @param scheduler
|
||||||
* @return this
|
* @return this
|
||||||
* @since 0.2.1
|
|
||||||
* @see Scheduler
|
* @see Scheduler
|
||||||
|
* @since 0.2.1
|
||||||
*/
|
*/
|
||||||
public Spider setScheduler(Scheduler scheduler) {
|
public Spider setScheduler(Scheduler scheduler) {
|
||||||
checkIfNotRunning();
|
checkIfRunning();
|
||||||
this.scheduler = scheduler;
|
this.scheduler = scheduler;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -153,8 +153,8 @@ public class Spider implements Runnable, Task {
|
||||||
*
|
*
|
||||||
* @param pipeline
|
* @param pipeline
|
||||||
* @return this
|
* @return this
|
||||||
* @deprecated
|
|
||||||
* @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline)
|
* @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline)
|
||||||
|
* @deprecated
|
||||||
*/
|
*/
|
||||||
public Spider pipeline(Pipeline pipeline) {
|
public Spider pipeline(Pipeline pipeline) {
|
||||||
return addPipeline(pipeline);
|
return addPipeline(pipeline);
|
||||||
|
@ -165,11 +165,11 @@ public class Spider implements Runnable, Task {
|
||||||
*
|
*
|
||||||
* @param pipeline
|
* @param pipeline
|
||||||
* @return this
|
* @return this
|
||||||
* @since 0.2.1
|
|
||||||
* @see Pipeline
|
* @see Pipeline
|
||||||
|
* @since 0.2.1
|
||||||
*/
|
*/
|
||||||
public Spider addPipeline(Pipeline pipeline) {
|
public Spider addPipeline(Pipeline pipeline) {
|
||||||
checkIfNotRunning();
|
checkIfRunning();
|
||||||
this.pipelines.add(pipeline);
|
this.pipelines.add(pipeline);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -189,8 +189,8 @@ public class Spider implements Runnable, Task {
|
||||||
*
|
*
|
||||||
* @param downloader
|
* @param downloader
|
||||||
* @return this
|
* @return this
|
||||||
* @deprecated
|
|
||||||
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
|
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
|
||||||
|
* @deprecated
|
||||||
*/
|
*/
|
||||||
public Spider downloader(Downloader downloader) {
|
public Spider downloader(Downloader downloader) {
|
||||||
return setDownloader(downloader);
|
return setDownloader(downloader);
|
||||||
|
@ -198,12 +198,13 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set the downloader of spider
|
* set the downloader of spider
|
||||||
* @see Downloader
|
*
|
||||||
* @param downloader
|
* @param downloader
|
||||||
* @return this
|
* @return this
|
||||||
|
* @see Downloader
|
||||||
*/
|
*/
|
||||||
public Spider setDownloader(Downloader downloader) {
|
public Spider setDownloader(Downloader downloader) {
|
||||||
checkIfNotRunning();
|
checkIfRunning();
|
||||||
this.downloader = downloader;
|
this.downloader = downloader;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -220,7 +221,8 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
|
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)
|
||||||
|
&& !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) {
|
||||||
throw new IllegalStateException("Spider is already running!");
|
throw new IllegalStateException("Spider is already running!");
|
||||||
}
|
}
|
||||||
checkComponent();
|
checkComponent();
|
||||||
|
@ -228,18 +230,19 @@ public class Spider implements Runnable, Task {
|
||||||
for (String startUrl : startUrls) {
|
for (String startUrl : startUrls) {
|
||||||
scheduler.push(new Request(startUrl), this);
|
scheduler.push(new Request(startUrl), this);
|
||||||
}
|
}
|
||||||
|
startUrls.clear();
|
||||||
}
|
}
|
||||||
Request request = scheduler.poll(this);
|
Request request = scheduler.poll(this);
|
||||||
//singel thread
|
//single thread
|
||||||
if (executorService == null) {
|
if (executorService == null) {
|
||||||
while (request != null) {
|
while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
|
||||||
processRequest(request);
|
processRequest(request);
|
||||||
request = scheduler.poll(this);
|
request = scheduler.poll(this);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//multi thread
|
//multi thread
|
||||||
final AtomicInteger threadAlive = new AtomicInteger(0);
|
final AtomicInteger threadAlive = new AtomicInteger(0);
|
||||||
while (true) {
|
while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
|
||||||
if (request == null) {
|
if (request == null) {
|
||||||
//when no request found but some thread is alive, sleep a while.
|
//when no request found but some thread is alive, sleep a while.
|
||||||
try {
|
try {
|
||||||
|
@ -311,7 +314,7 @@ public class Spider implements Runnable, Task {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
//for cycle retry
|
//for cycle retry
|
||||||
if (page.getHtml()==null){
|
if (page.getHtml() == null) {
|
||||||
addRequest(page);
|
addRequest(page);
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
return;
|
return;
|
||||||
|
@ -342,8 +345,8 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void checkIfNotRunning() {
|
protected void checkIfRunning() {
|
||||||
if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) {
|
if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) {
|
||||||
throw new IllegalStateException("Spider is already running!");
|
throw new IllegalStateException("Spider is already running!");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -354,6 +357,19 @@ public class Spider implements Runnable, Task {
|
||||||
thread.start();
|
thread.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void start() {
|
||||||
|
runAsync();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void stop() {
|
||||||
|
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void stopAndDestroy() {
|
||||||
|
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
|
||||||
|
destroy();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* start with more than one threads
|
* start with more than one threads
|
||||||
*
|
*
|
||||||
|
@ -361,7 +377,7 @@ public class Spider implements Runnable, Task {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider thread(int threadNum) {
|
public Spider thread(int threadNum) {
|
||||||
checkIfNotRunning();
|
checkIfRunning();
|
||||||
this.threadNum = threadNum;
|
this.threadNum = threadNum;
|
||||||
if (threadNum <= 0) {
|
if (threadNum <= 0) {
|
||||||
throw new IllegalArgumentException("threadNum should be more than one!");
|
throw new IllegalArgumentException("threadNum should be more than one!");
|
||||||
|
@ -377,9 +393,10 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* switch off xsoup
|
* switch off xsoup
|
||||||
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static void xsoupOff(){
|
public static void xsoupOff() {
|
||||||
EnvironmentUtil.setUseXsoup(false);
|
EnvironmentUtil.setUseXsoup(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,22 +2,30 @@ package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @since 0.3.0
|
* @since 0.3.0
|
||||||
*/
|
*/
|
||||||
public abstract class BaseElementSelector implements Selector,ElementSelector {
|
public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(String text) {
|
||||||
return select(Jsoup.parse(text));
|
if (text != null) {
|
||||||
|
return select(Jsoup.parse(text));
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(String text) {
|
public List<String> selectList(String text) {
|
||||||
return selectList(Jsoup.parse(text));
|
if (text != null) {
|
||||||
|
return selectList(Jsoup.parse(text));
|
||||||
|
} else {
|
||||||
|
return new ArrayList<String>();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,91 +0,0 @@
|
||||||
package us.codecraft.webmagic.selector;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import java.lang.reflect.Constructor;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Selector factory with some inner cache.<br>
|
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* @since 0.1.0
|
|
||||||
*/
|
|
||||||
public class SelectorFactory {
|
|
||||||
|
|
||||||
private Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();
|
|
||||||
|
|
||||||
private static final SelectorFactory INSTATNCE = new SelectorFactory();
|
|
||||||
|
|
||||||
public static SelectorFactory getInstatnce() {
|
|
||||||
return INSTATNCE;
|
|
||||||
}
|
|
||||||
|
|
||||||
public RegexSelector newRegexSelector(String regex) {
|
|
||||||
return newSelector(RegexSelector.class, regex);
|
|
||||||
}
|
|
||||||
|
|
||||||
public RegexSelector newRegexSelector(String regex, int group) {
|
|
||||||
String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
|
|
||||||
if (innerCache.get(cacheKey) != null) {
|
|
||||||
return (RegexSelector) innerCache.get(cacheKey);
|
|
||||||
}
|
|
||||||
return new RegexSelector(regex, group);
|
|
||||||
}
|
|
||||||
|
|
||||||
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
|
||||||
return newSelector(ReplaceSelector.class, regex, replacement);
|
|
||||||
}
|
|
||||||
|
|
||||||
public XpathSelector newXpathSelector(String xpath) {
|
|
||||||
return newSelector(XpathSelector.class, xpath);
|
|
||||||
}
|
|
||||||
|
|
||||||
public SmartContentSelector newSmartContentSelector() {
|
|
||||||
return newSelector(SmartContentSelector.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
|
|
||||||
String cacheKey = getCacheKey(RegexSelector.class, param);
|
|
||||||
if (innerCache.get(cacheKey) != null) {
|
|
||||||
return (T) innerCache.get(cacheKey);
|
|
||||||
}
|
|
||||||
T selector = newSelector(clazz, param);
|
|
||||||
if (selector != null) {
|
|
||||||
innerCache.put(cacheKey, selector);
|
|
||||||
}
|
|
||||||
return selector;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
|
|
||||||
try {
|
|
||||||
if (param.length == 0) {
|
|
||||||
Constructor<T> constructor
|
|
||||||
= clazz.getConstructor();
|
|
||||||
T selector = constructor.newInstance();
|
|
||||||
return selector;
|
|
||||||
} else if (param.length == 1) {
|
|
||||||
Constructor<T> constructor
|
|
||||||
= clazz.getConstructor(String.class);
|
|
||||||
T selector = constructor.newInstance(param[0]);
|
|
||||||
return selector;
|
|
||||||
} else if (param.length == 2) {
|
|
||||||
Constructor<T> constructor
|
|
||||||
= clazz.getConstructor(String.class, String.class);
|
|
||||||
T selector = constructor.newInstance(param[0], param[1]);
|
|
||||||
return selector;
|
|
||||||
} else {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new IllegalArgumentException("init object error", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getCacheKey(Class<?> clazz, String... param) {
|
|
||||||
return clazz.toString() + "_" + StringUtils.join(param, "_");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
@ -18,47 +20,33 @@ public class UrlUtils {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* canonicalizeUrl
|
* canonicalizeUrl
|
||||||
|
*
|
||||||
|
* Borrowed from Jsoup.
|
||||||
|
*
|
||||||
* @param url
|
* @param url
|
||||||
* @param refer
|
* @param refer
|
||||||
* @return canonicalizeUrl
|
* @return canonicalizeUrl
|
||||||
*/
|
*/
|
||||||
public static String canonicalizeUrl(String url, String refer) {
|
public static String canonicalizeUrl(String url, String refer) {
|
||||||
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
|
URL base;
|
||||||
return url;
|
try {
|
||||||
}
|
try {
|
||||||
if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) {
|
base = new URL(refer);
|
||||||
return url;
|
} catch (MalformedURLException e) {
|
||||||
}
|
// the base is unsuitable, but the attribute may be abs on its own, so try that
|
||||||
if (StringUtils.startsWith(url, "/")) {
|
URL abs = new URL(refer);
|
||||||
String host = getHost(refer);
|
return abs.toExternalForm();
|
||||||
return host + url;
|
|
||||||
} else if (!StringUtils.startsWith(url, ".")) {
|
|
||||||
refer = reversePath(refer, 1);
|
|
||||||
return refer + "/" + url;
|
|
||||||
} else {
|
|
||||||
Matcher matcher = relativePathPattern.matcher(url);
|
|
||||||
if (matcher.find()) {
|
|
||||||
int reverseDepth = matcher.group(1).length();
|
|
||||||
refer = reversePath(refer, reverseDepth);
|
|
||||||
String substring = StringUtils.substring(url, matcher.end());
|
|
||||||
return refer + "/" + substring;
|
|
||||||
} else {
|
|
||||||
refer = reversePath(refer, 1);
|
|
||||||
return refer + "/" + url;
|
|
||||||
}
|
}
|
||||||
|
// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
|
||||||
|
if (url.startsWith("?"))
|
||||||
|
url = base.getPath() + url;
|
||||||
|
URL abs = new URL(base, url);
|
||||||
|
return abs.toExternalForm();
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String reversePath(String url, int depth) {
|
|
||||||
int i = StringUtils.lastOrdinalIndexOf(url, "/", depth);
|
|
||||||
if (i < 10) {
|
|
||||||
url = getHost(url);
|
|
||||||
} else {
|
|
||||||
url = StringUtils.substring(url, 0, i);
|
|
||||||
}
|
|
||||||
return url;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String getHost(String url) {
|
public static String getHost(String url) {
|
||||||
String host = url;
|
String host = url;
|
||||||
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
||||||
|
|
|
@ -0,0 +1,28 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class SpiderTest {
|
||||||
|
|
||||||
|
@Ignore("long time")
|
||||||
|
@Test
|
||||||
|
public void testStartAndStop() throws InterruptedException {
|
||||||
|
Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
|
||||||
|
@Override
|
||||||
|
public void process(ResultItems resultItems, Task task) {
|
||||||
|
System.out.println(1);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
spider.start();
|
||||||
|
Thread.sleep(10000);
|
||||||
|
spider.stop();
|
||||||
|
// spider.run();
|
||||||
|
Thread.sleep(10000);
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,13 +19,12 @@ public class UrlUtilsTest {
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
||||||
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
|
|
||||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
|
|
||||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||||
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
||||||
|
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
|
||||||
|
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
||||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
|
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
|
||||||
|
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.3.0</version>
|
<version>0.3.1</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -184,7 +184,7 @@ class PageModelExtractor {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
if (objectExtractor == null) {
|
if (objectExtractor == null) {
|
||||||
return processSingle(page, null, false);
|
return processSingle(page, null, true);
|
||||||
} else {
|
} else {
|
||||||
if (objectExtractor.multi) {
|
if (objectExtractor.multi) {
|
||||||
List<Object> os = new ArrayList<Object>();
|
List<Object> os = new ArrayList<Object>();
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,14 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
import us.codecraft.webmagic.model.PageModelPipeline;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class MockPageModelPipeline implements PageModelPipeline{
|
||||||
|
@Override
|
||||||
|
public void process(Object o, Task task) {
|
||||||
|
Assert.assertNotNull(o);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,13 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class MockPipeline implements Pipeline{
|
||||||
|
@Override
|
||||||
|
public void process(ResultItems resultItems, Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,87 @@
|
||||||
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.MockDownloader;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
|
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
||||||
|
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
||||||
|
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
*/
|
||||||
|
@TargetUrl("https://github.com/\\w+/\\w+")
|
||||||
|
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
|
||||||
|
public class GithubRepo implements HasKey {
|
||||||
|
|
||||||
|
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
|
||||||
|
private String name;
|
||||||
|
|
||||||
|
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
||||||
|
private String author;
|
||||||
|
|
||||||
|
@ExtractBy("//div[@id='readme']")
|
||||||
|
private String readme;
|
||||||
|
|
||||||
|
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true)
|
||||||
|
private List<String> language;
|
||||||
|
|
||||||
|
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
|
||||||
|
private String star;
|
||||||
|
|
||||||
|
@ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()")
|
||||||
|
private String fork;
|
||||||
|
|
||||||
|
@ExtractByUrl
|
||||||
|
private String url;
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0)
|
||||||
|
, new PageModelPipeline<GithubRepo>() {
|
||||||
|
@Override
|
||||||
|
public void process(GithubRepo o, Task task) {
|
||||||
|
Assert.assertEquals("78",o.getStar().trim());
|
||||||
|
Assert.assertEquals("65",o.getFork().trim());
|
||||||
|
}
|
||||||
|
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String key() {
|
||||||
|
return author + ":" + name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getReadme() {
|
||||||
|
return readme;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getAuthor() {
|
||||||
|
return author;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getLanguage() {
|
||||||
|
return language;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getUrl() {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getStar() {
|
||||||
|
return star;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFork() {
|
||||||
|
return fork;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
package us.codecraft.webmagic.processor;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.*;
|
||||||
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class GithubRepoProcessor implements PageProcessor {
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString());
|
||||||
|
page.putField("fork",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() {
|
||||||
|
@Override
|
||||||
|
public void process(ResultItems resultItems, Task task) {
|
||||||
|
Assert.assertEquals("78",((String)resultItems.get("star")).trim());
|
||||||
|
Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
|
||||||
|
}
|
||||||
|
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.3.0</version>
|
<version>0.3.1</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -14,8 +14,6 @@ import java.util.Scanner;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-7 <br>
|
|
||||||
* Time: 下午9:24 <br>
|
|
||||||
*/
|
*/
|
||||||
public class QuickStarter {
|
public class QuickStarter {
|
||||||
|
|
||||||
|
|
|
@ -14,8 +14,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-10 <br>
|
|
||||||
* Time: 下午6:37 <br>
|
|
||||||
*/
|
*/
|
||||||
@TargetUrl("https://github.com/\\w+/\\w+")
|
@TargetUrl("https://github.com/\\w+/\\w+")
|
||||||
@HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"})
|
@HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"})
|
||||||
|
|
|
@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run();
|
OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
|
|
|
@ -10,8 +10,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-11 <br>
|
|
||||||
* Time: 下午9:29 <br>
|
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://www.36kr.com/p/\\d+.html")
|
@TargetUrl("http://www.36kr.com/p/\\d+.html")
|
||||||
@HelpUrl("http://www.36kr.com/#/page/\\d+")
|
@HelpUrl("http://www.36kr.com/#/page/\\d+")
|
||||||
|
|
|
@ -16,8 +16,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-4 <br>
|
|
||||||
* Time: 下午8:17 <br>
|
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
|
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
|
||||||
public class News163 implements MultiPageModel {
|
public class News163 implements MultiPageModel {
|
||||||
|
|
|
@ -9,8 +9,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-3 <br>
|
|
||||||
* Time: 下午8:25 <br>
|
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
|
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
|
||||||
@HelpUrl("http://www.oschina.net/question/*")
|
@HelpUrl("http://www.oschina.net/question/*")
|
||||||
|
|
|
@ -11,8 +11,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-2 <br>
|
|
||||||
* Time: 上午7:52 <br>
|
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
||||||
public class OschinaBlog implements HasKey{
|
public class OschinaBlog implements HasKey{
|
||||||
|
|
|
@ -8,8 +8,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午8:08
|
|
||||||
*/
|
*/
|
||||||
public class DiandianBlogProcessor implements PageProcessor {
|
public class DiandianBlogProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -9,8 +9,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午8:08
|
|
||||||
*/
|
*/
|
||||||
public class HuxiuProcessor implements PageProcessor {
|
public class HuxiuProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
|
@ -18,13 +16,16 @@ public class HuxiuProcessor implements PageProcessor {
|
||||||
List<String> requests = page.getHtml().links().regex(".*article.*").all();
|
List<String> requests = page.getHtml().links().regex(".*article.*").all();
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
|
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
|
||||||
page.putField("content",page.getHtml().smartContent());
|
page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
|
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new HuxiuProcessor()).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
|
|
|
@ -10,8 +10,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午8:08
|
|
||||||
*/
|
*/
|
||||||
public class InfoQMiniBookProcessor implements PageProcessor {
|
public class InfoQMiniBookProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-7-26 <br>
|
|
||||||
* Time: 上午7:31 <br>
|
|
||||||
*/
|
*/
|
||||||
public class IteyeBlogProcessor implements PageProcessor {
|
public class IteyeBlogProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
@ -24,8 +22,7 @@ public class IteyeBlogProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/").
|
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
|
||||||
setSleepTime(100).setRetryTimes(3);
|
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,6 @@ public class NjuBBSProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures").
|
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,8 +9,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午1:48
|
|
||||||
*/
|
*/
|
||||||
public class OschinaBlogPageProcesser implements PageProcessor {
|
public class OschinaBlogPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -8,8 +8,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午1:48
|
|
||||||
*/
|
*/
|
||||||
public class OschinaPageProcesser implements PageProcessor {
|
public class OschinaPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -8,8 +8,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午8:08
|
|
||||||
*/
|
*/
|
||||||
public class QzoneBlogProcessor implements PageProcessor {
|
public class QzoneBlogProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午1:48
|
|
||||||
*/
|
*/
|
||||||
public class SinaBlogProcesser implements PageProcessor {
|
public class SinaBlogProcesser implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -8,8 +8,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
|
||||||
* Time: 下午1:48
|
|
||||||
*/
|
*/
|
||||||
public class TianyaPageProcesser implements PageProcessor {
|
public class TianyaPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
touch wordpress.xml
|
|
||||||
cat wp-head.xml >> wordpress.xml
|
|
||||||
for f in `ls`;
|
|
||||||
do
|
|
||||||
cat ${f} >> ../wordpress.xml
|
|
||||||
done;
|
|
||||||
cat wp-bottom.xml >> wordpress.xml
|
|
|
@ -1,22 +0,0 @@
|
||||||
<item>
|
|
||||||
<title>${title}</title>
|
|
||||||
<link>http://127.0.0.1/wordpress/?p=${id}</link>
|
|
||||||
<pubDate>${date}</pubDate>
|
|
||||||
<dc:creator>admin</dc:creator>
|
|
||||||
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
|
|
||||||
<description></description>
|
|
||||||
<content:encoded><![CDATA[${content}]]></content:encoded>
|
|
||||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
|
||||||
<wp:post_id>${id}</wp:post_id>
|
|
||||||
<wp:post_date>${date}</wp:post_date>
|
|
||||||
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
|
||||||
<wp:comment_status>open</wp:comment_status>
|
|
||||||
<wp:ping_status>open</wp:ping_status>
|
|
||||||
<wp:post_name>${title}</wp:post_name>
|
|
||||||
<wp:status>publish</wp:status>
|
|
||||||
<wp:post_parent>0</wp:post_parent>
|
|
||||||
<wp:menu_order>0</wp:menu_order>
|
|
||||||
<wp:post_type>post</wp:post_type>
|
|
||||||
<wp:post_password></wp:post_password>
|
|
||||||
<wp:is_sticky>0</wp:is_sticky>
|
|
||||||
</item>
|
|
|
@ -1,2 +0,0 @@
|
||||||
</channel>
|
|
||||||
</rss>
|
|
|
@ -1,35 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8" ?>
|
|
||||||
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
|
|
||||||
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
|
|
||||||
<!-- You may use this file to transfer that content from one site to another. -->
|
|
||||||
<!-- This file is not intended to serve as a complete backup of your site. -->
|
|
||||||
|
|
||||||
<!-- To import this information into a WordPress site follow these steps: -->
|
|
||||||
<!-- 1. Log in to that site as an administrator. -->
|
|
||||||
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
|
|
||||||
<!-- 3. Install the "WordPress" importer from the list. -->
|
|
||||||
<!-- 4. Activate & Run Importer. -->
|
|
||||||
<!-- 5. Upload this file using the form provided on that page. -->
|
|
||||||
<!-- 6. You will first be asked to map the authors in this export file to users -->
|
|
||||||
<!-- on the site. For each author, you may choose to map to an -->
|
|
||||||
<!-- existing user on the site or to create a new user. -->
|
|
||||||
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
|
|
||||||
<!-- contained in this file into your site. -->
|
|
||||||
|
|
||||||
<!-- generator="WordPress/3.3.1" created="2012-06-10 09:15" -->
|
|
||||||
<rss version="2.0"
|
|
||||||
xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
|
|
||||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
|
||||||
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
|
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
||||||
xmlns:wp="http://wordpress.org/export/1.1/"
|
|
||||||
>
|
|
||||||
<channel>
|
|
||||||
<wp:wxr_version>1.1</wp:wxr_version>
|
|
||||||
<wp:base_site_url>http://127.0.0.1/wordpress</wp:base_site_url>
|
|
||||||
<wp:base_blog_url>http://127.0.0.1/wordpress</wp:base_blog_url>
|
|
||||||
|
|
||||||
<wp:author><wp:author_id>1</wp:author_id><wp:author_login>admin</wp:author_login><wp:author_email>flashsword20@163.com</wp:author_email><wp:author_display_name><![CDATA[admin]]></wp:author_display_name><wp:author_first_name><![CDATA[]]></wp:author_first_name><wp:author_last_name><![CDATA[]]></wp:author_last_name></wp:author>
|
|
||||||
|
|
||||||
|
|
||||||
<generator>http://wordpress.org/?v=3.3.1</generator>
|
|
|
@ -1,28 +0,0 @@
|
||||||
package us.codecraft.webmagic.processor;
|
|
||||||
|
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
|
||||||
import us.codecraft.webmagic.Spider;
|
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
|
||||||
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
|
||||||
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
|
||||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* Date: 13-6-9
|
|
||||||
* Time: 上午8:02
|
|
||||||
*/
|
|
||||||
public class DiaoyuwengProcessorTest {
|
|
||||||
|
|
||||||
@Ignore
|
|
||||||
@Test
|
|
||||||
public void test() throws IOException {
|
|
||||||
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
|
||||||
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
|
|
||||||
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
|
||||||
run();
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue