refactor selectable for html fragment #113
parent
03d26c169b
commit
f9825c214a
|
@ -0,0 +1,112 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
* @since 0.5.2
|
||||||
|
*/
|
||||||
|
public abstract class AbstractSelectable implements Selectable {
|
||||||
|
|
||||||
|
protected List<String> strings;
|
||||||
|
|
||||||
|
public AbstractSelectable(String text) {
|
||||||
|
List<String> results = new ArrayList<String>();
|
||||||
|
results.add(text);
|
||||||
|
this.strings = results;
|
||||||
|
}
|
||||||
|
|
||||||
|
public AbstractSelectable(List<String> strings) {
|
||||||
|
this.strings = strings;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable css(String selector) {
|
||||||
|
return $(selector);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable css(String selector, String attrName) {
|
||||||
|
return $(selector, attrName);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Selectable select(Selector selector, List<String> strings) {
|
||||||
|
List<String> results = new ArrayList<String>();
|
||||||
|
for (String string : strings) {
|
||||||
|
String result = selector.select(string);
|
||||||
|
if (result != null) {
|
||||||
|
results.add(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new PlainText(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Selectable selectList(Selector selector, List<String> strings) {
|
||||||
|
List<String> results = new ArrayList<String>();
|
||||||
|
for (String string : strings) {
|
||||||
|
List<String> result = selector.selectList(string);
|
||||||
|
results.addAll(result);
|
||||||
|
}
|
||||||
|
return new PlainText(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> all() {
|
||||||
|
return strings;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable jsonPath(String jsonPath) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String get() {
|
||||||
|
if (CollectionUtils.isNotEmpty(all())) {
|
||||||
|
return all().get(0);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable select(Selector selector) {
|
||||||
|
return select(selector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable selectList(Selector selector) {
|
||||||
|
return selectList(selector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable regex(String regex) {
|
||||||
|
RegexSelector regexSelector = Selectors.regex(regex);
|
||||||
|
return selectList(regexSelector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable regex(String regex, int group) {
|
||||||
|
RegexSelector regexSelector = Selectors.regex(regex, group);
|
||||||
|
return selectList(regexSelector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable replace(String regex, String replacement) {
|
||||||
|
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
|
||||||
|
return select(replaceSelector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean match() {
|
||||||
|
return strings != null && strings.size() > 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,8 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -28,4 +30,23 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Element selectElement(String text) {
|
||||||
|
if (text != null) {
|
||||||
|
return selectElement(Jsoup.parse(text));
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Elements selectElements(String text) {
|
||||||
|
if (text != null) {
|
||||||
|
return selectElements(Jsoup.parse(text));
|
||||||
|
} else {
|
||||||
|
return new Elements();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Selects a single element starting from the given element.
 * <p>
 * NOTE(review): the CssSelector and XpathSelector implementations return the
 * first match, or null when nothing matches - confirm that this is the
 * contract every implementation must follow.
 *
 * @param element the element to select from
 * @return the selected element
 */
public abstract Element selectElement(Element element);
|
||||||
|
|
||||||
|
/**
 * Selects elements starting from the given element.
 *
 * @param element the element to select from
 * @return the selected elements
 */
public abstract Elements selectElements(Element element);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(Element element) {
|
public String select(Element element) {
|
||||||
Elements elements = element.select(selectorText);
|
Elements elements = selectElements(element);
|
||||||
if (CollectionUtils.isEmpty(elements)) {
|
if (CollectionUtils.isEmpty(elements)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(Element doc) {
|
public List<String> selectList(Element doc) {
|
||||||
List<String> strings = new ArrayList<String>();
|
List<String> strings = new ArrayList<String>();
|
||||||
Elements elements = doc.select(selectorText);
|
Elements elements = selectElements(doc);
|
||||||
if (CollectionUtils.isNotEmpty(elements)) {
|
if (CollectionUtils.isNotEmpty(elements)) {
|
||||||
for (Element element : elements) {
|
for (Element element : elements) {
|
||||||
String value = getValue(element);
|
String value = getValue(element);
|
||||||
|
@ -78,4 +78,18 @@ public class CssSelector extends BaseElementSelector {
|
||||||
}
|
}
|
||||||
return strings;
|
return strings;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Element selectElement(Element element) {
|
||||||
|
Elements elements = element.select(selectorText);
|
||||||
|
if (CollectionUtils.isNotEmpty(elements)) {
|
||||||
|
return elements.get(0);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Elements selectElements(Element element) {
|
||||||
|
return element.select(selectorText);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -142,6 +142,13 @@ public class Html extends PlainText {
|
||||||
return document.html();
|
return document.html();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Selectable> nodes() {
|
||||||
|
ArrayList<Selectable> selectables = new ArrayList<Selectable>();
|
||||||
|
selectables.add(this);
|
||||||
|
return selectables;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param selector
|
* @param selector
|
||||||
* @return
|
* @return
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class HtmlFragment {
|
||||||
|
}
|
|
@ -1,7 +1,5 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -12,18 +10,14 @@ import java.util.List;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class PlainText implements Selectable {
|
public class PlainText extends AbstractSelectable {
|
||||||
|
|
||||||
protected List<String> strings;
|
|
||||||
|
|
||||||
public PlainText(List<String> strings) {
|
public PlainText(List<String> strings) {
|
||||||
this.strings = strings;
|
super(strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
public PlainText(String text) {
|
public PlainText(String text) {
|
||||||
List<String> results = new ArrayList<String>();
|
super(text);
|
||||||
results.add(text);
|
|
||||||
this.strings = results;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static PlainText create(String text) {
|
public static PlainText create(String text) {
|
||||||
|
@ -45,16 +39,6 @@ public class PlainText implements Selectable {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable css(String selector) {
|
|
||||||
return $(selector);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable css(String selector, String attrName) {
|
|
||||||
return $(selector, attrName);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable smartContent() {
|
public Selectable smartContent() {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
|
@ -66,79 +50,12 @@ public class PlainText implements Selectable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable regex(String regex) {
|
public List<Selectable> nodes() {
|
||||||
RegexSelector regexSelector = Selectors.regex(regex);
|
List<Selectable> nodes = new ArrayList<Selectable>(strings.size());
|
||||||
return selectList(regexSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable regex(String regex, int group) {
|
|
||||||
RegexSelector regexSelector = Selectors.regex(regex, group);
|
|
||||||
return selectList(regexSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Selectable select(Selector selector, List<String> strings) {
|
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
for (String string : strings) {
|
for (String string : strings) {
|
||||||
String result = selector.select(string);
|
nodes.add(PlainText.create(string));
|
||||||
if (result != null) {
|
|
||||||
results.add(result);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return new PlainText(results);
|
return nodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Selectable selectList(Selector selector, List<String> strings) {
|
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
for (String string : strings) {
|
|
||||||
List<String> result = selector.selectList(string);
|
|
||||||
results.addAll(result);
|
|
||||||
}
|
|
||||||
return new PlainText(results);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable replace(String regex, String replacement) {
|
|
||||||
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
|
|
||||||
return select(replaceSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<String> all() {
|
|
||||||
return strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable jsonPath(String jsonPath) {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String get() {
|
|
||||||
if (CollectionUtils.isNotEmpty(all())) {
|
|
||||||
return all().get(0);
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable select(Selector selector) {
|
|
||||||
return select(selector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable selectList(Selector selector) {
|
|
||||||
return selectList(selector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return get();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean match() {
|
|
||||||
return strings != null && strings.size() > 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -143,4 +143,10 @@ public interface Selectable {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Selectable selectList(Selector selector);
|
public Selectable selectList(Selector selector);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get all nodes of this selectable, each as a {@link Selectable}.
|
||||||
|
* @return the nodes as a list of selectables
|
||||||
|
*/
|
||||||
|
public List<Selectable> nodes();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
import us.codecraft.xsoup.XPathEvaluator;
|
import us.codecraft.xsoup.XPathEvaluator;
|
||||||
import us.codecraft.xsoup.Xsoup;
|
import us.codecraft.xsoup.Xsoup;
|
||||||
|
|
||||||
|
@ -29,4 +31,18 @@ public class XpathSelector extends BaseElementSelector {
|
||||||
public List<String> selectList(Element element) {
|
public List<String> selectList(Element element) {
|
||||||
return xPathEvaluator.evaluate(element).list();
|
return xPathEvaluator.evaluate(element).list();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Element selectElement(Element element) {
|
||||||
|
Elements elements = selectElements(element);
|
||||||
|
if (CollectionUtils.isNotEmpty(elements)){
|
||||||
|
return elements.get(0);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Elements selectElements(Element element) {
|
||||||
|
return xPathEvaluator.evaluate(element).getElements();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,46 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.samples.pipeline.OneFilePipeline;
|
||||||
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
import us.codecraft.webmagic.selector.Selectable;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class MamacnPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
Selectable images = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li");
|
||||||
|
page.putField("img", images.xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@src").get());
|
||||||
|
page.putField("title", page.getHtml().xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@alt").get());
|
||||||
|
page.putField("url", page.getUrl().toString());
|
||||||
|
if (page.getResultItems().get("title") == null) {
|
||||||
|
page.setSkip(true);
|
||||||
|
}
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return site;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
|
||||||
|
Spider.create(new MamacnPageProcessor())
|
||||||
|
.setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"))
|
||||||
|
.addUrl("http://www.mama.cn/photo/t1-p1.html")
|
||||||
|
.addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"))
|
||||||
|
.thread(5)
|
||||||
|
.run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
package us.codecraft.webmagic.samples.pipeline;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import us.codecraft.webmagic.ResultItems;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class OneFilePipeline extends FilePersistentBase implements Pipeline {
|
||||||
|
|
||||||
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private PrintWriter printWriter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a FilePipeline with default path"/data/webmagic/"
|
||||||
|
*/
|
||||||
|
public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
|
||||||
|
this("/data/webmagic/");
|
||||||
|
}
|
||||||
|
|
||||||
|
public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException {
|
||||||
|
setPath(path);
|
||||||
|
printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void process(ResultItems resultItems, Task task) {
|
||||||
|
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||||
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
|
if (entry.getValue() instanceof Iterable) {
|
||||||
|
Iterable value = (Iterable) entry.getValue();
|
||||||
|
printWriter.println(entry.getKey() + ":");
|
||||||
|
for (Object o : value) {
|
||||||
|
printWriter.println(o);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printWriter.println(entry.getKey() + ":\t" + entry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printWriter.flush();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue