diff --git a/.gitignore b/.gitignore index cd33b61..8e88e25 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ target/* *.iml out/ +.idea diff --git a/pom.xml b/pom.xml index 72e04ae..851a9e7 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.0 + 0.3.1 4.0.0 pom @@ -109,6 +109,14 @@ + + org.apache.maven.plugins + maven-surefire-plugin + + pertest + -Xms1024m -Xmx1024m -Xss1m + + org.apache.maven.plugins maven-compiler-plugin diff --git a/release.properties b/release.properties deleted file mode 100644 index 86e7224..0000000 --- a/release.properties +++ /dev/null @@ -1,11 +0,0 @@ -#release configuration -#Tue Aug 20 23:36:56 CST 2013 -scm.tagNameFormat=@{project.artifactId}-@{project.version} -pushChanges=true -scm.url=scm\:git\:git@github.com\:code4craft/webmagic.git -preparationGoals=clean verify -remoteTagging=true -scm.commentPrefix=[maven-release-plugin] -exec.additionalArguments=-Psonatype-oss-release -P development -exec.snapshotReleasePluginAllowed=false -completedPhase=check-poms diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 76526a8..f8e35d8 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.0 + 0.3.1 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 47cefd0..40fb70d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -79,22 +79,22 @@ public class Spider implements Runnable, Task { * create a spider with pageProcessor. * * @param pageProcessor + * @return new spider + * @see PageProcessor */ - public Spider(PageProcessor pageProcessor) { - this.pageProcessor = pageProcessor; - this.site = pageProcessor.getSite(); - this.startUrls = pageProcessor.getSite().getStartUrls(); + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); } /** * create a spider with pageProcessor. * * @param pageProcessor - * @return new spider - * @see PageProcessor */ - public static Spider create(PageProcessor pageProcessor) { - return new Spider(pageProcessor); + public Spider(PageProcessor pageProcessor) { + this.pageProcessor = pageProcessor; + this.site = pageProcessor.getSite(); + this.startUrls = pageProcessor.getSite().getStartUrls(); } /** @@ -105,7 +105,7 @@ public class Spider implements Runnable, Task { * @return this */ public Spider startUrls(List startUrls) { - checkIfNotRunning(); + checkIfRunning(); this.startUrls = startUrls; return this; } @@ -139,11 +139,11 @@ public class Spider implements Runnable, Task { * * @param scheduler * @return this - * @since 0.2.1 * @see Scheduler + * @since 0.2.1 */ public Spider setScheduler(Scheduler scheduler) { - checkIfNotRunning(); + checkIfRunning(); this.scheduler = scheduler; return this; } @@ -153,8 +153,8 @@ public class Spider implements Runnable, Task { * * @param pipeline * @return this - * @deprecated * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) + * @deprecated */ public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); @@ -165,11 +165,11 @@ public class Spider implements Runnable, Task { * * @param pipeline * @return this - * @since 0.2.1 * @see Pipeline + * @since 0.2.1 */ public Spider addPipeline(Pipeline pipeline) { - checkIfNotRunning(); + checkIfRunning(); this.pipelines.add(pipeline); return this; } @@ -189,8 +189,8 @@ public class Spider implements Runnable, Task { * * @param downloader * @return this - * @deprecated * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) + * @deprecated */ public Spider downloader(Downloader downloader) { return setDownloader(downloader); @@ -198,12 +198,13 @@ public class Spider implements Runnable, Task { /** * set the downloader of spider - * @see Downloader + * * @param downloader * @return this + * @see Downloader */ public Spider setDownloader(Downloader downloader) { - checkIfNotRunning(); + checkIfRunning(); this.downloader = downloader; return this; } @@ -220,7 +221,8 @@ public class Spider implements Runnable, Task { @Override public void run() { - if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) { + if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) + && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) { throw new IllegalStateException("Spider is already running!"); } checkComponent(); @@ -228,18 +230,19 @@ public class Spider implements Runnable, Task { for (String startUrl : startUrls) { scheduler.push(new Request(startUrl), this); } + startUrls.clear(); } Request request = scheduler.poll(this); - //singel thread + //single thread if (executorService == null) { - while (request != null) { + while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { processRequest(request); request = scheduler.poll(this); } } else { //multi thread final AtomicInteger threadAlive = new AtomicInteger(0); - while (true) { + while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { if (request == null) { //when no request found but some thread is alive, sleep a while. try { @@ -311,7 +314,7 @@ public class Spider implements Runnable, Task { return; } //for cycle retry - if (page.getHtml()==null){ + if (page.getHtml() == null) { addRequest(page); sleep(site.getSleepTime()); return; @@ -342,8 +345,8 @@ public class Spider implements Runnable, Task { } } - protected void checkIfNotRunning() { - if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) { + protected void checkIfRunning() { + if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { throw new IllegalStateException("Spider is already running!"); } } @@ -354,6 +357,19 @@ public class Spider implements Runnable, Task { thread.start(); } + public void start() { + runAsync(); + } + + public void stop() { + stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + } + + public void stopAndDestroy() { + stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + destroy(); + } + /** * start with more than one threads * @@ -361,7 +377,7 @@ public class Spider implements Runnable, Task { * @return this */ public Spider thread(int threadNum) { - checkIfNotRunning(); + checkIfRunning(); this.threadNum = threadNum; if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); @@ -377,9 +393,10 @@ public class Spider implements Runnable, Task { /** * switch off xsoup + * * @return */ - public static void xsoupOff(){ + public static void xsoupOff() { EnvironmentUtil.setUseXsoup(false); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index e313f24..7d9035f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -2,22 +2,30 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; +import java.util.ArrayList; import java.util.List; /** * @author code4crafter@gmail.com * @since 0.3.0 */ -public abstract class BaseElementSelector implements Selector,ElementSelector { +public abstract class BaseElementSelector implements Selector, ElementSelector { @Override public String select(String text) { - return select(Jsoup.parse(text)); + if (text != null) { + return select(Jsoup.parse(text)); + } + return null; } @Override public List selectList(String text) { - return selectList(Jsoup.parse(text)); + if (text != null) { + return selectList(Jsoup.parse(text)); + } else { + return new ArrayList(); + } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java deleted file mode 100644 index 8a0c76c..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ /dev/null @@ -1,91 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.apache.commons.lang3.StringUtils; - -import java.lang.reflect.Constructor; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -/** - * Selector factory with some inner cache.
- * - * @author code4crafter@gmail.com
- * @since 0.1.0 - */ -public class SelectorFactory { - - private Map innerCache = new ConcurrentHashMap(); - - private static final SelectorFactory INSTATNCE = new SelectorFactory(); - - public static SelectorFactory getInstatnce() { - return INSTATNCE; - } - - public RegexSelector newRegexSelector(String regex) { - return newSelector(RegexSelector.class, regex); - } - - public RegexSelector newRegexSelector(String regex, int group) { - String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group)); - if (innerCache.get(cacheKey) != null) { - return (RegexSelector) innerCache.get(cacheKey); - } - return new RegexSelector(regex, group); - } - - public ReplaceSelector newReplaceSelector(String regex, String replacement) { - return newSelector(ReplaceSelector.class, regex, replacement); - } - - public XpathSelector newXpathSelector(String xpath) { - return newSelector(XpathSelector.class, xpath); - } - - public SmartContentSelector newSmartContentSelector() { - return newSelector(SmartContentSelector.class); - } - - public T newAndCacheSelector(Class clazz, String... param) { - String cacheKey = getCacheKey(RegexSelector.class, param); - if (innerCache.get(cacheKey) != null) { - return (T) innerCache.get(cacheKey); - } - T selector = newSelector(clazz, param); - if (selector != null) { - innerCache.put(cacheKey, selector); - } - return selector; - - } - - public T newSelector(Class clazz, String... param) { - try { - if (param.length == 0) { - Constructor constructor - = clazz.getConstructor(); - T selector = constructor.newInstance(); - return selector; - } else if (param.length == 1) { - Constructor constructor - = clazz.getConstructor(String.class); - T selector = constructor.newInstance(param[0]); - return selector; - } else if (param.length == 2) { - Constructor constructor - = clazz.getConstructor(String.class, String.class); - T selector = constructor.newInstance(param[0], param[1]); - return selector; - } else { - throw new UnsupportedOperationException(); - } - } catch (Exception e) { - throw new IllegalArgumentException("init object error", e); - } - } - - private String getCacheKey(Class clazz, String... param) { - return clazz.toString() + "_" + StringUtils.join(param, "_"); - } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 4e1140b..4e5f67f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import java.net.MalformedURLException; +import java.net.URL; import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -18,47 +20,33 @@ public class UrlUtils { /** * canonicalizeUrl + * + * Borrowed from Jsoup. + * * @param url * @param refer * @return canonicalizeUrl */ public static String canonicalizeUrl(String url, String refer) { - if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { - return url; - } - if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) { - return url; - } - if (StringUtils.startsWith(url, "/")) { - String host = getHost(refer); - return host + url; - } else if (!StringUtils.startsWith(url, ".")) { - refer = reversePath(refer, 1); - return refer + "/" + url; - } else { - Matcher matcher = relativePathPattern.matcher(url); - if (matcher.find()) { - int reverseDepth = matcher.group(1).length(); - refer = reversePath(refer, reverseDepth); - String substring = StringUtils.substring(url, matcher.end()); - return refer + "/" + substring; - } else { - refer = reversePath(refer, 1); - return refer + "/" + url; + URL base; + try { + try { + base = new URL(refer); + } catch (MalformedURLException e) { + // the base is unsuitable, but the attribute may be abs on its own, so try that + URL abs = new URL(refer); + return abs.toExternalForm(); } + // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired + if (url.startsWith("?")) + url = base.getPath() + url; + URL abs = new URL(base, url); + return abs.toExternalForm(); + } catch (MalformedURLException e) { + return ""; } } - public static String reversePath(String url, int depth) { - int i = StringUtils.lastOrdinalIndexOf(url, "/", depth); - if (i < 10) { - url = getHost(url); - } else { - url = StringUtils.substring(url, 0, i); - } - return url; - } - public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java new file mode 100644 index 0000000..b3249ce --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.processor.SimplePageProcessor; + +/** + * @author code4crafter@gmail.com + */ +public class SpiderTest { + + @Ignore("long time") + @Test + public void testStartAndStop() throws InterruptedException { + Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() { + @Override + public void process(ResultItems resultItems, Task task) { + System.out.println(1); + } + }); + spider.start(); + Thread.sleep(10000); + spider.stop(); +// spider.run(); + Thread.sleep(10000); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index d1cbc21..abe6adc 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -19,13 +19,12 @@ public class UrlUtilsTest { fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); + fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/"); + Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); + Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); } @Test diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index e58fa02..098bc94 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.0 + 0.3.1 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 03cd3a3..54d942c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -184,7 +184,7 @@ class PageModelExtractor { return null; } if (objectExtractor == null) { - return processSingle(page, null, false); + return processSingle(page, null, true); } else { if (objectExtractor.multi) { List os = new ArrayList(); diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java new file mode 100644 index 0000000..8114b04 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java @@ -0,0 +1,1130 @@ +package us.codecraft.webmagic; + +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class MockDownloader implements Downloader{ + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " code4craft/webmagic\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + " This repository\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
This repository
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
All repositories
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "
    \n" + + "\n" + + "
  • \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + " 23\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " Unwatch\n" + + " \n" + + " \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + " Notification status\n" + + " \n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Not watching

    \n" + + " You only receive notifications for discussions in which you participate or are @mentioned.\n" + + " \n" + + " \n" + + " Watch\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Watching

    \n" + + " You receive notifications for all discussions in this repository.\n" + + " \n" + + " \n" + + " Unwatch\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Ignoring

    \n" + + " You do not receive any notifications for discussions in this repository.\n" + + " \n" + + " \n" + + " Stop ignoring\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
  • \n" + + "\n" + + "
  • \n" + + " \n" + + "
    \n" + + " \n" + + " Unstar\n" + + " \n" + + " \n" + + " Star\n" + + " \n" + + " 78\n" + + "
    \n" + + "\n" + + "
  • \n" + + "\n" + + "\n" + + "
  • \n" + + " \n" + + " Fork\n" + + " \n" + + " 65\n" + + "
  • \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "

\n" + + " public\n" + + " \n" + + " \n" + + " code4craft/webmagic\n" + + "\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + "\n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

HTTPS clone URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

SSH clone URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

Subversion checkout URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "

You can clone with\n" + + " HTTPS,\n" + + " SSH,\n" + + " Subversion,\n" + + " and other methods.\n" + + "

\n" + + "\n" + + " \n" + + " \n" + + " Clone in Desktop\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " Download ZIP\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "

A scalable web crawler framework.

\n" + + "
\n" + + "\n" + + "\n" + + " Edit\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "\n" + + " \n" + + " or cancel\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + " \n" + + " \n" + + " Java\n" + + " 100.0%\n" + + " \n" + + "
  2. \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + " Java\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " branch:\n" + + " master\n" + + " \n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " Switch branches/tags\n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
    \n" + + "
  • \n" + + " Branches\n" + + "
  • \n" + + "
  • \n" + + " Tags\n" + + "
  • \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " en-webmagic\n" + + "
\n" + + "
\n" + + " \n" + + " gh-pages\n" + + "
\n" + + "
\n" + + " \n" + + " master\n" + + "
\n" + + "
\n" + + " \n" + + " xsoup\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "

Create branch:

\n" + + " from ‘master’\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " webmagic-parent-0.2.1\n" + + "
\n" + + "
\n" + + " \n" + + " webmagic-0.3.0\n" + + "
\n" + + "
\n" + + " \n" + + " version-0.2.0\n" + + "
\n" + + "
\n" + + " \n" + + " version-0.1.0\n" + + "
\n" + + "
\n" + + "\n" + + "
Nothing to show
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "Show File Finder\n" + + "
\n" + + " \n" + + " \n" + + "\n" + + "
\n" + + "

\n" + + " Fetching latest commit…\n" + + "

\n" + + "
\n" + + "

\"Octocat-spinner-32-eaf2f5\"

\n" + + "

Cannot retrieve the latest commit at this time

\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " en_docs\n" + + " \n" + + " update readme\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-core\n" + + " \n" + + " fix null pointe exception #26\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-extension\n" + + " \n" + + " fix null pointe exception #26\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-lucene\n" + + " \n" + + " update pom\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-samples\n" + + " \n" + + " update version for samples\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-saxon\n" + + " \n" + + " xsoup test\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-selenium\n" + + " \n" + + " update pom\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " zh_docs\n" + + " \n" + + " update version\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " .gitignore\n" + + " \n" + + " 增加剔除文件\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " .travis.yml\n" + + " \n" + + " add jdk\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " README.md\n" + + " \n" + + " update version\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " pom.xml\n" + + " \n" + + " 将单元测试fork独立的JVM来跑。避免少数情况默认maven开的JVM堆太小。\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " release-note.md\n" + + " \n" + + " release note\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic manual.md\n" + + " \n" + + " readme\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " README.md

\n" + + "webmagic

\n" + + "\n" + + "

Readme in Chinese

\n" + + "\n" + + "

\"Build

\n" + + "\n" + + "
\n" + + "

A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.

\n" + + "
\n" + + "\n" + + "

\n" + + "Features:

\n" + + "\n" + + "
    \n" + + "
  • Simple core with high flexibility.
  • \n" + + "
  • Simple API for html extracting.
  • \n" + + "
  • Annotation with POJO to customize a crawler, no configuration.
  • \n" + + "
  • Multi-thread and Distribution support.
  • \n" + + "
  • Easy to be integrated.
  • \n" + + "

\n" + + "Install:

\n" + + "\n" + + "

Add dependencies to your pom.xml:

\n" + + "\n" + + "
    <dependency>\n" +
+            "        <groupId>us.codecraft</groupId>\n" +
+            "        <artifactId>webmagic-core</artifactId>\n" +
+            "        <version>0.3.0</version>\n" +
+            "    </dependency>\n" +
+            "    <dependency>\n" +
+            "        <groupId>us.codecraft</groupId>\n" +
+            "        <artifactId>webmagic-extension</artifactId>\n" +
+            "        <version>0.3.0</version>\n" +
+            "    </dependency>\n" +
+            "
\n" + + "\n" + + "

\n" + + "Get Started:

\n" + + "\n" + + "

\n" + + "First crawler:

\n" + + "\n" + + "

Write a class implements PageProcessor:

\n" + + "\n" + + "
    public class OschinaBlogPageProcesser implements PageProcessor {\n" +
+            "\n" +
+            "        private Site site = Site.me().setDomain(\"my.oschina.net\")\n" +
+            "           .addStartUrl(\"http://my.oschina.net/flashsword/blog\");\n" +
+            "\n" +
+            "        @Override\n" +
+            "        public void process(Page page) {\n" +
+            "            List<String> links = page.getHtml().links().regex(\"http://my\\\\.oschina\\\\.net/flashsword/blog/\\\\d+\").all();\n" +
+            "            page.addTargetRequests(links);\n" +
+            "            page.putField(\"title\", page.getHtml().xpath(\"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1\").toString());\n" +
+            "            page.putField(\"content\", page.getHtml().$(\"div.content\").toString());\n" +
+            "            page.putField(\"tags\",page.getHtml().xpath(\"//div[@class='BlogTags']/a/text()\").all());\n" +
+            "        }\n" +
+            "\n" +
+            "        @Override\n" +
+            "        public Site getSite() {\n" +
+            "            return site;\n" +
+            "\n" +
+            "        }\n" +
+            "\n" +
+            "        public static void main(String[] args) {\n" +
+            "            Spider.create(new OschinaBlogPageProcesser())\n" +
+            "                 .pipeline(new ConsolePipeline()).run();\n" +
+            "        }\n" +
+            "    }\n" +
+            "
\n" + + "\n" + + "
    \n" + + "
  • \n" + + "

    page.addTargetRequests(links)

    \n" + + "\n" + + "

    Add urls for crawling.

    \n" + + "
  • \n" + + "

You can also use annotation way:

\n" + + "\n" + + "
    @TargetUrl(\"http://my.oschina.net/flashsword/blog/\\\\d+\")\n" +
+            "    public class OschinaBlog {\n" +
+            "\n" +
+            "        @ExtractBy(\"//title\")\n" +
+            "        private String title;\n" +
+            "\n" +
+            "        @ExtractBy(value = \"div.BlogContent\",type = ExtractBy.Type.Css)\n" +
+            "        private String content;\n" +
+            "\n" +
+            "        @ExtractBy(value = \"//div[@class='BlogTags']/a/text()\", multi = true)\n" +
+            "        private List<String> tags;\n" +
+            "\n" +
+            "        public static void main(String[] args) {\n" +
+            "            OOSpider.create(\n" +
+            "                Site.me().addStartUrl(\"http://my.oschina.net/flashsword/blog\"),\n" +
+            "                new ConsolePageModelPipeline(), OschinaBlog.class).run();\n" +
+            "        }\n" +
+            "    }\n" +
+            "
\n" + + "\n" + + "

\n" + + "Docs and samples:

\n" + + "\n" + + "

The architecture of webmagic (refered to Scrapy)

\n" + + "\n" + + "

\"image\"

\n" + + "\n" + + "

Javadocs: http://code4craft.github.io/webmagic/docs/en/

\n" + + "\n" + + "

There are some samples in webmagic-samples package.

\n" + + "\n" + + "

\n" + + "Lisence:

\n" + + "\n" + + "

Lisenced under Apache 2.0 lisence

\n" + + "\n" + + "

\n" + + "Thanks:

\n" + + "\n" + + "

To write webmagic, I refered to the projects below :

\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " Something went wrong with that request. Please try again.\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n"; + @Override + public Page download(Request request, Task task) { + Page page = new Page(); + page.setHtml(new Html(html)); + page.setRequest(new Request("https://github.com/code4craft/webmagic")); + page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); + return page; + } + + @Override + public void setThread(int threadNum) { + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java new file mode 100644 index 0000000..ea7601b --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic; + +import junit.framework.Assert; +import us.codecraft.webmagic.model.PageModelPipeline; + +/** + * @author code4crafter@gmail.com + */ +public class MockPageModelPipeline implements PageModelPipeline{ + @Override + public void process(Object o, Task task) { + Assert.assertNotNull(o); + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java new file mode 100644 index 0000000..7572c15 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic; + +import us.codecraft.webmagic.pipeline.Pipeline; + +/** + * @author code4crafter@gmail.com + */ +public class MockPipeline implements Pipeline{ + @Override + public void process(ResultItems resultItems, Task task) { + + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java new file mode 100644 index 0000000..5b6319a --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java @@ -0,0 +1,87 @@ +package us.codecraft.webmagic.model; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.MockDownloader; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +@TargetUrl("https://github.com/\\w+/\\w+") +@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) +public class GithubRepo implements HasKey { + + @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + private String name; + + @ExtractByUrl("https://github\\.com/(\\w+)/.*") + private String author; + + @ExtractBy("//div[@id='readme']") + private String readme; + + @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true) + private List language; + + @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") + private String star; + + @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") + private String fork; + + @ExtractByUrl + private String url; + + @Test + public void test() { + OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0) + , new PageModelPipeline() { + @Override + public void process(GithubRepo o, Task task) { + Assert.assertEquals("78",o.getStar().trim()); + Assert.assertEquals("65",o.getFork().trim()); + } + }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + } + + @Override + public String key() { + return author + ":" + name; + } + + public String getName() { + return name; + } + + public String getReadme() { + return readme; + } + + public String getAuthor() { + return author; + } + + public List getLanguage() { + return language; + } + + public String getUrl() { + return url; + } + + public String getStar() { + return star; + } + + public String getFork() { + return fork; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java new file mode 100644 index 0000000..02b2ac1 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java @@ -0,0 +1,35 @@ +package us.codecraft.webmagic.processor; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.*; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.pipeline.Pipeline; + +/** + * @author code4crafter@gmail.com + */ +public class GithubRepoProcessor implements PageProcessor { + @Override + public void process(Page page) { + page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString()); + page.putField("fork",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString()); + } + + @Override + public Site getSite() { + return Site.me().addStartUrl("https://github.com/code4craft/webmagic"); + } + + @Test + public void test() { + OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() { + @Override + public void process(ResultItems resultItems, Task task) { + Assert.assertEquals("78",((String)resultItems.get("star")).trim()); + Assert.assertEquals("65",((String)resultItems.get("fork")).trim()); + } + }).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + } + +} diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 1c8af93..a63880f 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.0 + 0.3.1 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java index 69adabb..074dd0f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -14,8 +14,6 @@ import java.util.Scanner; /** * @author code4crafter@gmail.com
- * Date: 13-8-7
- * Time: 下午9:24
*/ public class QuickStarter { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index 79a20ff..e8998ec 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -14,8 +14,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-10
- * Time: 下午6:37
*/ @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"}) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index ae94525..7e3dc51 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index bba8d82..de3fdf5 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -10,8 +10,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * Date: 13-8-11
- * Time: 下午9:29
*/ @TargetUrl("http://www.36kr.com/p/\\d+.html") @HelpUrl("http://www.36kr.com/#/page/\\d+") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 946e737..e9dfb26 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -16,8 +16,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-4
- * Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") public class News163 implements MultiPageModel { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java index e878633..112f86a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -9,8 +9,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * Date: 13-8-3
- * Time: 下午8:25
*/ @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") @HelpUrl("http://www.oschina.net/question/*") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 96de977..7819b44 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -11,8 +11,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-2
- * Time: 上午7:52
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog implements HasKey{ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index a1189e4..25baa1f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class DiandianBlogProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 136eeb8..deae29e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -9,8 +9,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class HuxiuProcessor implements PageProcessor { @Override @@ -18,13 +16,16 @@ public class HuxiuProcessor implements PageProcessor { List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); - page.putField("content",page.getHtml().smartContent()); + page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()")); } @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"); + } + + public static void main(String[] args) { + Spider.create(new HuxiuProcessor()).run(); } public static void main(String[] args) { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 38de3bc..3ef3957 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -10,8 +10,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class InfoQMiniBookProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index f80f895..26b85e8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 上午7:31
*/ public class IteyeBlogProcessor implements PageProcessor { @@ -24,8 +22,7 @@ public class IteyeBlogProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). - setSleepTime(100).setRetryTimes(3); + site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"); } return site; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 2337da5..16dcb0c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -22,7 +22,6 @@ public class NjuBBSProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index e447003..ded1a5f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -9,8 +9,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class OschinaBlogPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index 522eb2c..b75cc83 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class OschinaPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 49418b6..d9cee2b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class QzoneBlogProcessor implements PageProcessor { @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index b4c5bc8..dcb6eff 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class SinaBlogProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index ecc55b4..d14b442 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class TianyaPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/resources/combine.sh b/webmagic-samples/src/main/resources/combine.sh deleted file mode 100644 index 0e7bd0c..0000000 --- a/webmagic-samples/src/main/resources/combine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -touch wordpress.xml -cat wp-head.xml >> wordpress.xml -for f in `ls`; - do - cat ${f} >> ../wordpress.xml - done; -cat wp-bottom.xml >> wordpress.xml \ No newline at end of file diff --git a/webmagic-samples/src/main/resources/ftl/wordpress.ftl b/webmagic-samples/src/main/resources/ftl/wordpress.ftl deleted file mode 100644 index f2feeb1..0000000 --- a/webmagic-samples/src/main/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,22 +0,0 @@ - - ${title} - http://127.0.0.1/wordpress/?p=${id} - ${date} - admin - http://127.0.0.1/wordpress/?p=${id} - - - - ${id} - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - diff --git a/webmagic-samples/src/main/resources/wp-bottom.xml b/webmagic-samples/src/main/resources/wp-bottom.xml deleted file mode 100644 index f651c3b..0000000 --- a/webmagic-samples/src/main/resources/wp-bottom.xml +++ /dev/null @@ -1,2 +0,0 @@ - - \ No newline at end of file diff --git a/webmagic-samples/src/main/resources/wp-head.xml b/webmagic-samples/src/main/resources/wp-head.xml deleted file mode 100644 index 8330ba1..0000000 --- a/webmagic-samples/src/main/resources/wp-head.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - 1.1 - http://127.0.0.1/wordpress - http://127.0.0.1/wordpress - - 1adminflashsword20@163.com - - - http://wordpress.org/?v=3.3.1 diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java deleted file mode 100644 index 0371eb2..0000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.processor; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.JsonFilePipeline; -import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; - -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午8:02 - */ -public class DiaoyuwengProcessorTest { - - @Ignore - @Test - public void test() throws IOException { - DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); - JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); - Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). - run(); - } -}