This reverts commit f051d978e2.
master
parent
f051d978e2
commit
31548deb93
|
@ -169,25 +169,18 @@ public class Page {
|
||||||
* @param priority Priority for the URL
|
* @param priority Priority for the URL
|
||||||
*/
|
*/
|
||||||
private void addRequestIfValid(String url, long priority) {
|
private void addRequestIfValid(String url, long priority) {
|
||||||
boolean isBlankUrl = StringUtils.isBlank(url);
|
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
|
||||||
boolean isHashSymbol = url.equals("#");
|
return;
|
||||||
boolean isJavaScript = url.startsWith("javascript:");
|
|
||||||
|
|
||||||
if (isBlankUrl || isHashSymbol || isJavaScript) {
|
|
||||||
return; // Invalid URL, so no further processing is needed.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
|
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
|
||||||
Request request = new Request(canonicalizedUrl);
|
Request req = new Request(canonicalizedUrl);
|
||||||
|
if(priority > 0) {
|
||||||
if (priority > 0) {
|
req.setPriority(priority);
|
||||||
request.setPriority(priority);
|
|
||||||
}
|
}
|
||||||
|
targetRequests.add(req);
|
||||||
targetRequests.add(request);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* add url to fetch
|
* add url to fetch
|
||||||
*
|
*
|
||||||
|
|
|
@ -40,14 +40,13 @@ public class HttpClientGenerator {
|
||||||
|
|
||||||
private PoolingHttpClientConnectionManager connectionManager;
|
private PoolingHttpClientConnectionManager connectionManager;
|
||||||
|
|
||||||
private static final int DEFAULT_MAX_PER_ROUTE = 100;
|
|
||||||
public HttpClientGenerator() {
|
public HttpClientGenerator() {
|
||||||
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
|
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
|
||||||
.register("http", PlainConnectionSocketFactory.INSTANCE)
|
.register("http", PlainConnectionSocketFactory.INSTANCE)
|
||||||
.register("https", buildSSLConnectionSocketFactory())
|
.register("https", buildSSLConnectionSocketFactory())
|
||||||
.build();
|
.build();
|
||||||
connectionManager = new PoolingHttpClientConnectionManager(reg);
|
connectionManager = new PoolingHttpClientConnectionManager(reg);
|
||||||
connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE);
|
connectionManager.setDefaultMaxPerRoute(100);
|
||||||
}
|
}
|
||||||
|
|
||||||
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
|
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
|
||||||
|
|
|
@ -64,7 +64,7 @@ public class HttpRequestBody implements Serializable {
|
||||||
this.encoding = encoding;
|
this.encoding = encoding;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HttpRequestBody createJsonRequestBody(String json, String encoding) {
|
public static HttpRequestBody json(String json, String encoding) {
|
||||||
try {
|
try {
|
||||||
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
|
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
|
||||||
} catch (UnsupportedEncodingException e) {
|
} catch (UnsupportedEncodingException e) {
|
||||||
|
|
|
@ -1,53 +0,0 @@
|
||||||
package us.codecraft.webmagic.selector;
|
|
||||||
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.ListIterator;
|
|
||||||
|
|
||||||
public class ElementsUtil {
|
|
||||||
HtmlNode htmlNode = new HtmlNode();
|
|
||||||
public Selectable selectElements(BaseElementSelector elementSelector) {
|
|
||||||
ListIterator<Element> elementIterator = htmlNode.getElements().listIterator();
|
|
||||||
if (!elementSelector.hasAttribute()) {
|
|
||||||
List<Element> resultElements = new ArrayList<Element>();
|
|
||||||
while (elementIterator.hasNext()) {
|
|
||||||
Element element = checkElementAndConvert(elementIterator);
|
|
||||||
List<Element> selectElements = elementSelector.selectElements(element);
|
|
||||||
resultElements.addAll(selectElements);
|
|
||||||
}
|
|
||||||
return new HtmlNode(resultElements);
|
|
||||||
} else {
|
|
||||||
// has attribute, consider as plaintext
|
|
||||||
List<String> resultStrings = new ArrayList<String>();
|
|
||||||
while (elementIterator.hasNext()) {
|
|
||||||
Element element = checkElementAndConvert(elementIterator);
|
|
||||||
List<String> selectList = elementSelector.selectList(element);
|
|
||||||
resultStrings.addAll(selectList);
|
|
||||||
}
|
|
||||||
return new PlainText(resultStrings);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Only document can be select
|
|
||||||
* See: https://github.com/code4craft/webmagic/issues/113
|
|
||||||
*
|
|
||||||
* @param elementIterator elementIterator
|
|
||||||
* @return element element
|
|
||||||
*/
|
|
||||||
public Element checkElementAndConvert(ListIterator<Element> elementIterator) {
|
|
||||||
Element element = elementIterator.next();
|
|
||||||
if (!(element instanceof Document)) {
|
|
||||||
Document root = new Document(element.ownerDocument().baseUri());
|
|
||||||
Element clone = element.clone();
|
|
||||||
root.appendChild(clone);
|
|
||||||
elementIterator.set(root);
|
|
||||||
return root;
|
|
||||||
}
|
|
||||||
return element;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -33,22 +33,19 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable links() {
|
public Selectable links() {
|
||||||
ElementsUtil elementsUtil = new ElementsUtil();
|
return selectElements(new LinksSelector());
|
||||||
return elementsUtil.selectElements(new LinksSelector());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable xpath(String xpath) {
|
public Selectable xpath(String xpath) {
|
||||||
ElementsUtil elementsUtil = new ElementsUtil();
|
|
||||||
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
||||||
return elementsUtil.selectElements(xpathSelector);
|
return selectElements(xpathSelector);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable selectList(Selector selector) {
|
public Selectable selectList(Selector selector) {
|
||||||
if (selector instanceof BaseElementSelector) {
|
if (selector instanceof BaseElementSelector) {
|
||||||
ElementsUtil elementsUtil = new ElementsUtil();
|
return selectElements((BaseElementSelector) selector);
|
||||||
return elementsUtil.selectElements((BaseElementSelector) selector);
|
|
||||||
}
|
}
|
||||||
return selectList(selector, getSourceTexts());
|
return selectList(selector, getSourceTexts());
|
||||||
}
|
}
|
||||||
|
@ -58,18 +55,64 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
return selectList(selector);
|
return selectList(selector);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* select elements
|
||||||
|
*
|
||||||
|
* @param elementSelector elementSelector
|
||||||
|
* @return result
|
||||||
|
*/
|
||||||
|
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
||||||
|
ListIterator<Element> elementIterator = getElements().listIterator();
|
||||||
|
if (!elementSelector.hasAttribute()) {
|
||||||
|
List<Element> resultElements = new ArrayList<Element>();
|
||||||
|
while (elementIterator.hasNext()) {
|
||||||
|
Element element = checkElementAndConvert(elementIterator);
|
||||||
|
List<Element> selectElements = elementSelector.selectElements(element);
|
||||||
|
resultElements.addAll(selectElements);
|
||||||
|
}
|
||||||
|
return new HtmlNode(resultElements);
|
||||||
|
} else {
|
||||||
|
// has attribute, consider as plaintext
|
||||||
|
List<String> resultStrings = new ArrayList<String>();
|
||||||
|
while (elementIterator.hasNext()) {
|
||||||
|
Element element = checkElementAndConvert(elementIterator);
|
||||||
|
List<String> selectList = elementSelector.selectList(element);
|
||||||
|
resultStrings.addAll(selectList);
|
||||||
|
}
|
||||||
|
return new PlainText(resultStrings);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Only document can be select
|
||||||
|
* See: https://github.com/code4craft/webmagic/issues/113
|
||||||
|
*
|
||||||
|
* @param elementIterator elementIterator
|
||||||
|
* @return element element
|
||||||
|
*/
|
||||||
|
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
|
||||||
|
Element element = elementIterator.next();
|
||||||
|
if (!(element instanceof Document)) {
|
||||||
|
Document root = new Document(element.ownerDocument().baseUri());
|
||||||
|
Element clone = element.clone();
|
||||||
|
root.appendChild(clone);
|
||||||
|
elementIterator.set(root);
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
return element;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
ElementsUtil elementsUtil = new ElementsUtil();
|
|
||||||
CssSelector cssSelector = Selectors.$(selector);
|
CssSelector cssSelector = Selectors.$(selector);
|
||||||
return elementsUtil.selectElements(cssSelector);
|
return selectElements(cssSelector);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector, String attrName) {
|
public Selectable $(String selector, String attrName) {
|
||||||
ElementsUtil elementsUtil = new ElementsUtil();
|
|
||||||
CssSelector cssSelector = Selectors.$(selector, attrName);
|
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||||
return elementsUtil.selectElements(cssSelector);
|
return selectElements(cssSelector);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -76,27 +76,26 @@ public class ExtractRule {
|
||||||
}
|
}
|
||||||
|
|
||||||
private Selector compileSelector() {
|
private Selector compileSelector() {
|
||||||
SelectorFactory factory;
|
|
||||||
switch (expressionType) {
|
switch (expressionType) {
|
||||||
case Css:
|
case Css:
|
||||||
factory = new CssSelectorFactory();
|
if (expressionParams.length >= 1) {
|
||||||
break;
|
return $(expressionValue, expressionParams[0]);
|
||||||
|
} else {
|
||||||
|
return $(expressionValue);
|
||||||
|
}
|
||||||
case XPath:
|
case XPath:
|
||||||
factory = new XPathSelectorFactory();
|
return xpath(expressionValue);
|
||||||
break;
|
|
||||||
case Regex:
|
case Regex:
|
||||||
factory = new RegexSelectorFactory();
|
if (expressionParams.length >= 1) {
|
||||||
break;
|
return regex(expressionValue, Integer.parseInt(expressionParams[0]));
|
||||||
|
} else {
|
||||||
|
return regex(expressionValue);
|
||||||
|
}
|
||||||
case JsonPath:
|
case JsonPath:
|
||||||
factory = new JsonPathSelectorFactory();
|
return new JsonPathSelector(expressionValue);
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
factory = new XPathSelectorFactory(); // Default to XPath
|
return xpath(expressionValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
SelectorCompiler selectorCompiler = new SelectorCompiler(factory);
|
|
||||||
Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams);
|
|
||||||
return compiledSelector;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setSelector(Selector selector) {
|
public void setSelector(Selector selector) {
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
package us.codecraft.webmagic.configurable;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.selector.JsonPathSelector;
|
|
||||||
import us.codecraft.webmagic.selector.Selector;
|
|
||||||
|
|
||||||
import static us.codecraft.webmagic.selector.Selectors.*;
|
|
||||||
public interface SelectorFactory {
|
|
||||||
Selector compileSelector(String expressionValue, String[] expressionParams);
|
|
||||||
}
|
|
||||||
|
|
||||||
class CssSelectorFactory implements SelectorFactory {
|
|
||||||
@Override
|
|
||||||
public Selector compileSelector(String expressionValue, String[] expressionParams) {
|
|
||||||
if (expressionParams.length >= 1) {
|
|
||||||
return $(expressionValue, expressionParams[0]);
|
|
||||||
} else {
|
|
||||||
return $(expressionValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class XPathSelectorFactory implements SelectorFactory {
|
|
||||||
@Override
|
|
||||||
public Selector compileSelector(String expressionValue, String[] expressionParams) {
|
|
||||||
return xpath(expressionValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class RegexSelectorFactory implements SelectorFactory {
|
|
||||||
@Override
|
|
||||||
public Selector compileSelector(String expressionValue, String[] expressionParams) {
|
|
||||||
if (expressionParams.length >= 1) {
|
|
||||||
return regex(expressionValue, Integer.parseInt(expressionParams[0]));
|
|
||||||
} else {
|
|
||||||
return regex(expressionValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class JsonPathSelectorFactory implements SelectorFactory {
|
|
||||||
@Override
|
|
||||||
public Selector compileSelector(String expressionValue, String[] expressionParams) {
|
|
||||||
return new JsonPathSelector(expressionValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class SelectorCompiler {
|
|
||||||
private final SelectorFactory selectorFactory;
|
|
||||||
|
|
||||||
public SelectorCompiler(SelectorFactory selectorFactory) {
|
|
||||||
this.selectorFactory = selectorFactory;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Selector compileSelector(String expressionValue, String[] expressionParams) {
|
|
||||||
return selectorFactory.compileSelector(expressionValue, expressionParams);
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue