update pom
parent
55d4a76ab7
commit
d7cd9e5747
|
@ -59,7 +59,7 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable xpath(String xpath) {
|
public Selectable xpath(String xpath) {
|
||||||
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
XsoupSelector xpathSelector = new XsoupSelector(xpath);
|
||||||
return selectList(xpathSelector, strings);
|
return selectList(xpathSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,7 @@
|
||||||
package us.codecraft.webmagic.utils;
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
import us.codecraft.webmagic.selector.CssSelector;
|
import us.codecraft.webmagic.selector.*;
|
||||||
import us.codecraft.webmagic.selector.RegexSelector;
|
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
|
||||||
import us.codecraft.webmagic.selector.XpathSelector;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -27,10 +24,10 @@ public class ExtractorUtils {
|
||||||
selector = new RegexSelector(value);
|
selector = new RegexSelector(value);
|
||||||
break;
|
break;
|
||||||
case XPath:
|
case XPath:
|
||||||
selector = new XpathSelector(value);
|
selector = new XsoupSelector(value);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
selector = new XpathSelector(value);
|
selector = new XsoupSelector(value);
|
||||||
}
|
}
|
||||||
return selector;
|
return selector;
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.2.1</version>
|
<version>0.2.2-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
|
||||||
|
@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
if (page.getUrl().toString().contains("thread")){
|
if (page.getUrl().toString().contains("thread")){
|
||||||
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
|
||||||
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody"));
|
page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
|
||||||
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
|
page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
|
||||||
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
|
page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
|
||||||
}
|
}
|
||||||
|
@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new DiaoyuwengProcessor()).run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
|
List<String> strings = page.getHtml().links().regex(".*/yewu/.*").all();
|
||||||
page.addTargetRequests(strings);
|
page.addTargetRequests(strings);
|
||||||
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
|
||||||
page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
|
page.putField("body",page.getHtml().xpath("//dd"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
|
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new F58PageProcesser()).run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -26,4 +27,8 @@ public class HuxiuProcessor implements PageProcessor {
|
||||||
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
|
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new HuxiuProcessor()).run();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue