update xsoup version to release #113
parent
a5d1b56e44
commit
95bdb30296
2
pom.xml
2
pom.xml
|
@ -88,7 +88,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>xsoup</artifactId>
|
<artifactId>xsoup</artifactId>
|
||||||
<version>0.2.4-SNAPSHOT</version>
|
<version>0.2.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.alibaba</groupId>
|
<groupId>com.alibaba</groupId>
|
||||||
|
|
|
@ -10,6 +10,7 @@ import us.codecraft.webmagic.selector.Selectable;
|
||||||
|
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
|
@ -20,11 +21,14 @@ public class MamacnPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
Selectable images = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li");
|
List<Selectable> nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
|
||||||
page.putField("img", images.xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@src").get());
|
StringBuilder accum = new StringBuilder();
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@alt").get());
|
for (Selectable node : nodes) {
|
||||||
page.putField("url", page.getUrl().toString());
|
accum.append("img:").append(node.xpath("//a/@href").get()).append("\n");
|
||||||
if (page.getResultItems().get("title") == null) {
|
accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n");
|
||||||
|
}
|
||||||
|
page.putField("",accum.toString());
|
||||||
|
if (accum.length() == 0) {
|
||||||
page.setSkip(true);
|
page.setSkip(true);
|
||||||
}
|
}
|
||||||
page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
|
page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
|
||||||
|
|
Loading…
Reference in New Issue