diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java new file mode 100644 index 0000000..8114b04 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java @@ -0,0 +1,1130 @@ +package us.codecraft.webmagic; + +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class MockDownloader implements Downloader{ + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " code4craft/webmagic\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + " This repository\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
This repository
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
All repositories
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + "
    \n" + + "\n" + + "
  • \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + " 23\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " Unwatch\n" + + " \n" + + " \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + " Notification status\n" + + " \n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Not watching

    \n" + + " You only receive notifications for discussions in which you participate or are @mentioned.\n" + + " \n" + + " \n" + + " Watch\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Watching

    \n" + + " You receive notifications for all discussions in this repository.\n" + + " \n" + + " \n" + + " Unwatch\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + " \n" + + "
    \n" + + " \n" + + "

    Ignoring

    \n" + + " You do not receive any notifications for discussions in this repository.\n" + + " \n" + + " \n" + + " Stop ignoring\n" + + " \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "\n" + + "
    \n" + + "
    \n" + + "
    \n" + + "\n" + + "
    \n" + + "
  • \n" + + "\n" + + "
  • \n" + + " \n" + + "
    \n" + + " \n" + + " Unstar\n" + + " \n" + + " \n" + + " Star\n" + + " \n" + + " 78\n" + + "
    \n" + + "\n" + + "
  • \n" + + "\n" + + "\n" + + "
  • \n" + + " \n" + + " Fork\n" + + " \n" + + " 65\n" + + "
  • \n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "

\n" + + " public\n" + + " \n" + + " \n" + + " code4craft/webmagic\n" + + "\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + "\n" + + "

\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

HTTPS clone URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

SSH clone URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "

Subversion checkout URL

\n" + + "\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "

You can clone with\n" + + " HTTPS,\n" + + " SSH,\n" + + " Subversion,\n" + + " and other methods.\n" + + "

\n" + + "\n" + + " \n" + + " \n" + + " Clone in Desktop\n" + + " \n" + + "\n" + + "\n" + + " \n" + + " \n" + + " Download ZIP\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "

A scalable web crawler framework.

\n" + + "
\n" + + "\n" + + "\n" + + " Edit\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + "
\n" + + "\n" + + " \n" + + " or cancel\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + " \n" + + " \n" + + " Java\n" + + " 100.0%\n" + + " \n" + + "
  2. \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + " \n" + + " Java\n" + + " \n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " branch:\n" + + " master\n" + + " \n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " Switch branches/tags\n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
    \n" + + "
  • \n" + + " Branches\n" + + "
  • \n" + + "
  • \n" + + " Tags\n" + + "
  • \n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " en-webmagic\n" + + "
\n" + + "
\n" + + " \n" + + " gh-pages\n" + + "
\n" + + "
\n" + + " \n" + + " master\n" + + "
\n" + + "
\n" + + " \n" + + " xsoup\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "

Create branch:

\n" + + " from ‘master’\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " webmagic-parent-0.2.1\n" + + "
\n" + + "
\n" + + " \n" + + " webmagic-0.3.0\n" + + "
\n" + + "
\n" + + " \n" + + " version-0.2.0\n" + + "
\n" + + "
\n" + + " \n" + + " version-0.1.0\n" + + "
\n" + + "
\n" + + "\n" + + "
Nothing to show
\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "Show File Finder\n" + + "
\n" + + " \n" + + " \n" + + "\n" + + "
\n" + + "

\n" + + " Fetching latest commit…\n" + + "

\n" + + "
\n" + + "

\"Octocat-spinner-32-eaf2f5\"

\n" + + "

Cannot retrieve the latest commit at this time

\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " en_docs\n" + + " \n" + + " update readme\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-core\n" + + " \n" + + " fix null pointe exception #26\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-extension\n" + + " \n" + + " fix null pointe exception #26\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-lucene\n" + + " \n" + + " update pom\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-samples\n" + + " \n" + + " update version for samples\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-saxon\n" + + " \n" + + " xsoup test\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic-selenium\n" + + " \n" + + " update pom\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " zh_docs\n" + + " \n" + + " update version\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " .gitignore\n" + + " \n" + + " 增加剔除文件\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " .travis.yml\n" + + " \n" + + " add jdk\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " README.md\n" + + " \n" + + " update version\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " pom.xml\n" + + " \n" + + " 将单元测试fork独立的JVM来跑。避免少数情况默认maven开的JVM堆太小。\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " release-note.md\n" + + " \n" + + " release note\n" + + "
\n" + + " \n" + + " \"Octocat-spinner-32\"\n" + + " \n" + + " webmagic manual.md\n" + + " \n" + + " readme\n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + " README.md

\n" + + "webmagic

\n" + + "\n" + + "

Readme in Chinese

\n" + + "\n" + + "

\"Build

\n" + + "\n" + + "
\n" + + "

A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.

\n" + + "
\n" + + "\n" + + "

\n" + + "Features:

\n" + + "\n" + + "
    \n" + + "
  • Simple core with high flexibility.
  • \n" + + "
  • Simple API for html extracting.
  • \n" + + "
  • Annotation with POJO to customize a crawler, no configuration.
  • \n" + + "
  • Multi-thread and Distribution support.
  • \n" + + "
  • Easy to be integrated.
  • \n" + + "

\n" + + "Install:

\n" + + "\n" + + "

Add dependencies to your pom.xml:

\n" + + "\n" + + "
    <dependency>\n" +
+            "        <groupId>us.codecraft</groupId>\n" +
+            "        <artifactId>webmagic-core</artifactId>\n" +
+            "        <version>0.3.0</version>\n" +
+            "    </dependency>\n" +
+            "    <dependency>\n" +
+            "        <groupId>us.codecraft</groupId>\n" +
+            "        <artifactId>webmagic-extension</artifactId>\n" +
+            "        <version>0.3.0</version>\n" +
+            "    </dependency>\n" +
+            "
\n" + + "\n" + + "

\n" + + "Get Started:

\n" + + "\n" + + "

\n" + + "First crawler:

\n" + + "\n" + + "

Write a class implements PageProcessor:

\n" + + "\n" + + "
    public class OschinaBlogPageProcesser implements PageProcessor {\n" +
+            "\n" +
+            "        private Site site = Site.me().setDomain(\"my.oschina.net\")\n" +
+            "           .addStartUrl(\"http://my.oschina.net/flashsword/blog\");\n" +
+            "\n" +
+            "        @Override\n" +
+            "        public void process(Page page) {\n" +
+            "            List<String> links = page.getHtml().links().regex(\"http://my\\\\.oschina\\\\.net/flashsword/blog/\\\\d+\").all();\n" +
+            "            page.addTargetRequests(links);\n" +
+            "            page.putField(\"title\", page.getHtml().xpath(\"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1\").toString());\n" +
+            "            page.putField(\"content\", page.getHtml().$(\"div.content\").toString());\n" +
+            "            page.putField(\"tags\",page.getHtml().xpath(\"//div[@class='BlogTags']/a/text()\").all());\n" +
+            "        }\n" +
+            "\n" +
+            "        @Override\n" +
+            "        public Site getSite() {\n" +
+            "            return site;\n" +
+            "\n" +
+            "        }\n" +
+            "\n" +
+            "        public static void main(String[] args) {\n" +
+            "            Spider.create(new OschinaBlogPageProcesser())\n" +
+            "                 .pipeline(new ConsolePipeline()).run();\n" +
+            "        }\n" +
+            "    }\n" +
+            "
\n" + + "\n" + + "
    \n" + + "
  • \n" + + "

    page.addTargetRequests(links)

    \n" + + "\n" + + "

    Add urls for crawling.

    \n" + + "
  • \n" + + "

You can also use annotation way:

\n" + + "\n" + + "
    @TargetUrl(\"http://my.oschina.net/flashsword/blog/\\\\d+\")\n" +
+            "    public class OschinaBlog {\n" +
+            "\n" +
+            "        @ExtractBy(\"//title\")\n" +
+            "        private String title;\n" +
+            "\n" +
+            "        @ExtractBy(value = \"div.BlogContent\",type = ExtractBy.Type.Css)\n" +
+            "        private String content;\n" +
+            "\n" +
+            "        @ExtractBy(value = \"//div[@class='BlogTags']/a/text()\", multi = true)\n" +
+            "        private List<String> tags;\n" +
+            "\n" +
+            "        public static void main(String[] args) {\n" +
+            "            OOSpider.create(\n" +
+            "                Site.me().addStartUrl(\"http://my.oschina.net/flashsword/blog\"),\n" +
+            "                new ConsolePageModelPipeline(), OschinaBlog.class).run();\n" +
+            "        }\n" +
+            "    }\n" +
+            "
\n" + + "\n" + + "

\n" + + "Docs and samples:

\n" + + "\n" + + "

The architecture of webmagic (refered to Scrapy)

\n" + + "\n" + + "

\"image\"

\n" + + "\n" + + "

Javadocs: http://code4craft.github.io/webmagic/docs/en/

\n" + + "\n" + + "

There are some samples in webmagic-samples package.

\n" + + "\n" + + "

\n" + + "Lisence:

\n" + + "\n" + + "

Lisenced under Apache 2.0 lisence

\n" + + "\n" + + "

\n" + + "Thanks:

\n" + + "\n" + + "

To write webmagic, I refered to the projects below :

\n" + + "\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "
\n" + + "
\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + "\n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + "
\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + " \n" + + " Something went wrong with that request. Please try again.\n" + + "
\n" + + "\n" + + " \n" + + " \n" + + "\n" + + "\n"; + @Override + public Page download(Request request, Task task) { + Page page = new Page(); + page.setHtml(new Html(html)); + page.setRequest(new Request("https://github.com/code4craft/webmagic")); + page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); + return page; + } + + @Override + public void setThread(int threadNum) { + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java new file mode 100644 index 0000000..ea7601b --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic; + +import junit.framework.Assert; +import us.codecraft.webmagic.model.PageModelPipeline; + +/** + * @author code4crafter@gmail.com + */ +public class MockPageModelPipeline implements PageModelPipeline{ + @Override + public void process(Object o, Task task) { + Assert.assertNotNull(o); + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java new file mode 100644 index 0000000..7572c15 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic; + +import us.codecraft.webmagic.pipeline.Pipeline; + +/** + * @author code4crafter@gmail.com + */ +public class MockPipeline implements Pipeline{ + @Override + public void process(ResultItems resultItems, Task task) { + + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java new file mode 100644 index 0000000..5b6319a --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java @@ -0,0 +1,87 @@ +package us.codecraft.webmagic.model; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.MockDownloader; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.ExtractByUrl; +import us.codecraft.webmagic.model.annotation.HelpUrl; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +@TargetUrl("https://github.com/\\w+/\\w+") +@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) +public class GithubRepo implements HasKey { + + @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + private String name; + + @ExtractByUrl("https://github\\.com/(\\w+)/.*") + private String author; + + @ExtractBy("//div[@id='readme']") + private String readme; + + @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true) + private List language; + + @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") + private String star; + + @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") + private String fork; + + @ExtractByUrl + private String url; + + @Test + public void test() { + OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0) + , new PageModelPipeline() { + @Override + public void process(GithubRepo o, Task task) { + Assert.assertEquals("78",o.getStar().trim()); + Assert.assertEquals("65",o.getFork().trim()); + } + }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + } + + @Override + public String key() { + return author + ":" + name; + } + + public String getName() { + return name; + } + + public String getReadme() { + return readme; + } + + public String getAuthor() { + return author; + } + + public List getLanguage() { + return language; + } + + public String getUrl() { + return url; + } + + public String getStar() { + return star; + } + + public String getFork() { + return fork; + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java new file mode 100644 index 0000000..02b2ac1 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java @@ -0,0 +1,35 @@ +package us.codecraft.webmagic.processor; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.*; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.pipeline.Pipeline; + +/** + * @author code4crafter@gmail.com + */ +public class GithubRepoProcessor implements PageProcessor { + @Override + public void process(Page page) { + page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString()); + page.putField("fork",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString()); + } + + @Override + public Site getSite() { + return Site.me().addStartUrl("https://github.com/code4craft/webmagic"); + } + + @Test + public void test() { + OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() { + @Override + public void process(ResultItems resultItems, Task task) { + Assert.assertEquals("78",((String)resultItems.get("star")).trim()); + Assert.assertEquals("65",((String)resultItems.get("fork")).trim()); + } + }).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java index 69adabb..074dd0f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java @@ -14,8 +14,6 @@ import java.util.Scanner; /** * @author code4crafter@gmail.com
- * Date: 13-8-7
- * Time: 下午9:24
*/ public class QuickStarter { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index 79a20ff..e8998ec 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -14,8 +14,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-10
- * Time: 下午6:37
*/ @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"}) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java index ae94525..7e3dc51 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java @@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run(); + OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java index bba8d82..de3fdf5 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java @@ -10,8 +10,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * Date: 13-8-11
- * Time: 下午9:29
*/ @TargetUrl("http://www.36kr.com/p/\\d+.html") @HelpUrl("http://www.36kr.com/#/page/\\d+") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 946e737..e9dfb26 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -16,8 +16,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-4
- * Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") public class News163 implements MultiPageModel { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java index e878633..112f86a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java @@ -9,8 +9,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
- * Date: 13-8-3
- * Time: 下午8:25
*/ @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") @HelpUrl("http://www.oschina.net/question/*") diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 96de977..7819b44 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -11,8 +11,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-2
- * Time: 上午7:52
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog implements HasKey{ diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index a1189e4..25baa1f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class DiandianBlogProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java deleted file mode 100644 index 3ceba0a..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ /dev/null @@ -1,46 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.selector.PlainText; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 - */ -public class DiaoyuwengProcessor implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - List requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all(); - page.addTargetRequests(requests); - requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all(); - page.addTargetRequests(requests); - if (page.getUrl().toString().contains("thread")){ - page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()")); - page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); - page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); - } - } - - @Override - public Site getSite() { - if (site==null){ - site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new DiaoyuwengProcessor()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java deleted file mode 100644 index 3d27be8..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 - */ -public class F58PageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().links().regex(".*/yewu/.*").all(); - page.addTargetRequests(strings); - page.putField("title",page.getHtml().regex("(.*)")); - page.putField("body",page.getHtml().xpath("//dd")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. - } - - public static void main(String[] args) { - Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 136eeb8..7cb7be2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -9,8 +9,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class HuxiuProcessor implements PageProcessor { @Override @@ -18,13 +16,12 @@ public class HuxiuProcessor implements PageProcessor { List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); - page.putField("content",page.getHtml().smartContent()); + page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()")); } @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"); } public static void main(String[] args) { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 38de3bc..3ef3957 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -10,8 +10,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class InfoQMiniBookProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index f80f895..26b85e8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 上午7:31
*/ public class IteyeBlogProcessor implements PageProcessor { @@ -24,8 +22,7 @@ public class IteyeBlogProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"). - setSleepTime(100).setRetryTimes(3); + site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"); } return site; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java deleted file mode 100644 index 0ab6c64..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ /dev/null @@ -1,32 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- * Date: 13-5-20 - * Time: 下午5:31 - */ -public class KaichibaProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; - page.addTargetRequest("http://kaichiba.com/shop/" + i); - page.putField("title",page.getHtml().xpath("//Title")); - page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", "")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - - public static void main(String[] args) { - Spider.create(new KaichibaProcessor()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java deleted file mode 100644 index bfa347d..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ /dev/null @@ -1,38 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- * Date: 13-5-20 - * Time: 下午5:31 - */ -public class MeicanProcessor implements PageProcessor { - @Override - public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); - if (requests.size() > 2) { - requests = requests.subList(0, 2); - } - page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); - page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); - page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - - public static void main(String[] args) { - Spider.create(new MeicanProcessor()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 2337da5..16dcb0c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -22,7 +22,6 @@ public class NjuBBSProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index e447003..ded1a5f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -9,8 +9,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class OschinaBlogPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index 522eb2c..b75cc83 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class OschinaPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 49418b6..d9cee2b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午8:08 */ public class QzoneBlogProcessor implements PageProcessor { @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index b4c5bc8..dcb6eff 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class SinaBlogProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index ecc55b4..d14b442 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -8,8 +8,6 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:48 */ public class TianyaPageProcesser implements PageProcessor { diff --git a/webmagic-samples/src/main/resources/combine.sh b/webmagic-samples/src/main/resources/combine.sh deleted file mode 100644 index 0e7bd0c..0000000 --- a/webmagic-samples/src/main/resources/combine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -touch wordpress.xml -cat wp-head.xml >> wordpress.xml -for f in `ls`; - do - cat ${f} >> ../wordpress.xml - done; -cat wp-bottom.xml >> wordpress.xml \ No newline at end of file diff --git a/webmagic-samples/src/main/resources/ftl/wordpress.ftl b/webmagic-samples/src/main/resources/ftl/wordpress.ftl deleted file mode 100644 index f2feeb1..0000000 --- a/webmagic-samples/src/main/resources/ftl/wordpress.ftl +++ /dev/null @@ -1,22 +0,0 @@ - - ${title} - http://127.0.0.1/wordpress/?p=${id} - ${date} - admin - http://127.0.0.1/wordpress/?p=${id} - - - - ${id} - ${date} - ${date} - open - open - ${title} - publish - 0 - 0 - post - - 0 - diff --git a/webmagic-samples/src/main/resources/wp-bottom.xml b/webmagic-samples/src/main/resources/wp-bottom.xml deleted file mode 100644 index f651c3b..0000000 --- a/webmagic-samples/src/main/resources/wp-bottom.xml +++ /dev/null @@ -1,2 +0,0 @@ - - \ No newline at end of file diff --git a/webmagic-samples/src/main/resources/wp-head.xml b/webmagic-samples/src/main/resources/wp-head.xml deleted file mode 100644 index 8330ba1..0000000 --- a/webmagic-samples/src/main/resources/wp-head.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - 1.1 - http://127.0.0.1/wordpress - http://127.0.0.1/wordpress - - 1adminflashsword20@163.com - - - http://wordpress.org/?v=3.3.1 diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java deleted file mode 100644 index 0371eb2..0000000 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package us.codecraft.webmagic.processor; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; -import us.codecraft.webmagic.pipeline.JsonFilePipeline; -import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; - -import java.io.IOException; - -/** - * @author code4crafter@gmail.com
- * Date: 13-6-9 - * Time: 上午8:02 - */ -public class DiaoyuwengProcessorTest { - - @Ignore - @Test - public void test() throws IOException { - DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor(); - JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); - Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). - run(); - } -}