Revert "Refactored code for increased optimization. (#1139)" (#1153)

This reverts commit f051d978e2.
master
Sutra Zhou 2024-03-30 14:37:55 +08:00 committed by GitHub
parent f051d978e2
commit 31548deb93
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 74 additions and 150 deletions

View File

@ -169,25 +169,18 @@ public class Page {
* @param priority Priority for the URL * @param priority Priority for the URL
*/ */
private void addRequestIfValid(String url, long priority) { private void addRequestIfValid(String url, long priority) {
boolean isBlankUrl = StringUtils.isBlank(url); if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
boolean isHashSymbol = url.equals("#"); return;
boolean isJavaScript = url.startsWith("javascript:");
if (isBlankUrl || isHashSymbol || isJavaScript) {
return; // Invalid URL, so no further processing is needed.
} }
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
Request request = new Request(canonicalizedUrl); Request req = new Request(canonicalizedUrl);
if(priority > 0) { if(priority > 0) {
request.setPriority(priority); req.setPriority(priority);
} }
targetRequests.add(req);
targetRequests.add(request);
} }
/** /**
* add url to fetch * add url to fetch
* *

View File

@ -40,14 +40,13 @@ public class HttpClientGenerator {
private PoolingHttpClientConnectionManager connectionManager; private PoolingHttpClientConnectionManager connectionManager;
private static final int DEFAULT_MAX_PER_ROUTE = 100;
public HttpClientGenerator() { public HttpClientGenerator() {
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create() Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE) .register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", buildSSLConnectionSocketFactory()) .register("https", buildSSLConnectionSocketFactory())
.build(); .build();
connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE); connectionManager.setDefaultMaxPerRoute(100);
} }
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {

View File

@ -64,7 +64,7 @@ public class HttpRequestBody implements Serializable {
this.encoding = encoding; this.encoding = encoding;
} }
public static HttpRequestBody createJsonRequestBody(String json, String encoding) { public static HttpRequestBody json(String json, String encoding) {
try { try {
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {

View File

@ -1,53 +0,0 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
/**
 * Runs a {@link BaseElementSelector} over the elements held by an {@link HtmlNode}.
 *
 * <p>The node to select against should be supplied through
 * {@link #ElementsUtil(HtmlNode)}. The no-argument constructor is kept only for
 * backward compatibility: it binds the helper to a freshly created {@code HtmlNode},
 * which presumably holds no elements (confirm against {@code HtmlNode}), so selections
 * made through it cannot see any caller's content.
 */
public class ElementsUtil {

    /** Node whose elements are fed to the selector. */
    HtmlNode htmlNode;

    /**
     * Creates a helper bound to a new, caller-independent {@link HtmlNode}.
     * Prefer {@link #ElementsUtil(HtmlNode)} so the selection runs on real content.
     */
    public ElementsUtil() {
        this(new HtmlNode());
    }

    /**
     * Creates a helper bound to the given node.
     *
     * @param htmlNode node whose elements will be selected against
     * @throws IllegalArgumentException if {@code htmlNode} is {@code null}
     */
    public ElementsUtil(HtmlNode htmlNode) {
        if (htmlNode == null) {
            throw new IllegalArgumentException("htmlNode must not be null");
        }
        this.htmlNode = htmlNode;
    }

    /**
     * Applies the selector to every element of the bound node and gathers the results.
     *
     * @param elementSelector selector to apply to each element
     * @return an {@link HtmlNode} of the matched elements when the selector has no
     *         attribute, otherwise a {@link PlainText} of the selected strings
     */
    public Selectable selectElements(BaseElementSelector elementSelector) {
        List<Element> elements = htmlNode.getElements();
        if (elements == null) {
            // A node without an element list yields an empty result instead of an NPE.
            elements = new ArrayList<Element>();
        }
        ListIterator<Element> elementIterator = elements.listIterator();
        if (!elementSelector.hasAttribute()) {
            List<Element> resultElements = new ArrayList<Element>();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                resultElements.addAll(elementSelector.selectElements(element));
            }
            return new HtmlNode(resultElements);
        }
        // The selector targets an attribute, so the result is treated as plain text.
        List<String> resultStrings = new ArrayList<String>();
        while (elementIterator.hasNext()) {
            Element element = checkElementAndConvert(elementIterator);
            resultStrings.addAll(elementSelector.selectList(element));
        }
        return new PlainText(resultStrings);
    }

    /**
     * Advances the iterator and makes sure the element handed to the selector is a
     * {@link Document}, because only documents can be selected against.
     * See: https://github.com/code4craft/webmagic/issues/113
     *
     * <p>A non-document element is cloned into a new {@code Document}, and that
     * document replaces the element at the iterator's current position.
     *
     * @param elementIterator iterator positioned before the element to check
     * @return the element itself if it is already a {@code Document}, otherwise the
     *         newly created wrapping document
     */
    public Element checkElementAndConvert(ListIterator<Element> elementIterator) {
        Element element = elementIterator.next();
        if (element instanceof Document) {
            return element;
        }
        Document owner = element.ownerDocument();
        // A detached element has no owner document; fall back to its own base URI.
        String baseUri = owner != null ? owner.baseUri() : element.baseUri();
        Document root = new Document(baseUri);
        root.appendChild(element.clone());
        elementIterator.set(root);
        return root;
    }
}

View File

@ -33,22 +33,19 @@ public class HtmlNode extends AbstractSelectable {
@Override @Override
public Selectable links() { public Selectable links() {
ElementsUtil elementsUtil = new ElementsUtil(); return selectElements(new LinksSelector());
return elementsUtil.selectElements(new LinksSelector());
} }
@Override @Override
public Selectable xpath(String xpath) { public Selectable xpath(String xpath) {
ElementsUtil elementsUtil = new ElementsUtil();
XpathSelector xpathSelector = Selectors.xpath(xpath); XpathSelector xpathSelector = Selectors.xpath(xpath);
return elementsUtil.selectElements(xpathSelector); return selectElements(xpathSelector);
} }
@Override @Override
public Selectable selectList(Selector selector) { public Selectable selectList(Selector selector) {
if (selector instanceof BaseElementSelector) { if (selector instanceof BaseElementSelector) {
ElementsUtil elementsUtil = new ElementsUtil(); return selectElements((BaseElementSelector) selector);
return elementsUtil.selectElements((BaseElementSelector) selector);
} }
return selectList(selector, getSourceTexts()); return selectList(selector, getSourceTexts());
} }
@ -58,18 +55,64 @@ public class HtmlNode extends AbstractSelectable {
return selectList(selector); return selectList(selector);
} }
/**
 * Applies an element selector to each element of this node and collects the output.
 *
 * @param elementSelector selector to run against every element
 * @return an {@link HtmlNode} of matched elements when the selector has no attribute,
 *         otherwise a {@link PlainText} holding the selected strings
 */
protected Selectable selectElements(BaseElementSelector elementSelector) {
    final ListIterator<Element> cursor = getElements().listIterator();
    if (elementSelector.hasAttribute()) {
        // An attribute selector produces strings, so the result is plain text.
        final List<String> texts = new ArrayList<String>();
        while (cursor.hasNext()) {
            texts.addAll(elementSelector.selectList(checkElementAndConvert(cursor)));
        }
        return new PlainText(texts);
    }
    final List<Element> matched = new ArrayList<Element>();
    while (cursor.hasNext()) {
        matched.addAll(elementSelector.selectElements(checkElementAndConvert(cursor)));
    }
    return new HtmlNode(matched);
}
/**
 * Takes the next element from the iterator and ensures it is a {@link Document},
 * since only documents can be selected against.
 * See: https://github.com/code4craft/webmagic/issues/113
 *
 * <p>An element that is not a document is cloned into a new {@code Document} built
 * with the owner document's base URI; that document then replaces the element at
 * the iterator's current position.
 *
 * @param elementIterator iterator positioned before the element to check
 * @return the element itself when it already is a {@code Document}, otherwise the
 *         wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    final Element current = elementIterator.next();
    if (current instanceof Document) {
        return current;
    }
    final Document wrapper = new Document(current.ownerDocument().baseUri());
    wrapper.appendChild(current.clone());
    elementIterator.set(wrapper);
    return wrapper;
}
@Override @Override
public Selectable $(String selector) { public Selectable $(String selector) {
ElementsUtil elementsUtil = new ElementsUtil();
CssSelector cssSelector = Selectors.$(selector); CssSelector cssSelector = Selectors.$(selector);
return elementsUtil.selectElements(cssSelector); return selectElements(cssSelector);
} }
@Override @Override
public Selectable $(String selector, String attrName) { public Selectable $(String selector, String attrName) {
ElementsUtil elementsUtil = new ElementsUtil();
CssSelector cssSelector = Selectors.$(selector, attrName); CssSelector cssSelector = Selectors.$(selector, attrName);
return elementsUtil.selectElements(cssSelector); return selectElements(cssSelector);
} }
@Override @Override

View File

@ -76,27 +76,26 @@ public class ExtractRule {
} }
private Selector compileSelector() { private Selector compileSelector() {
SelectorFactory factory;
switch (expressionType) { switch (expressionType) {
case Css: case Css:
factory = new CssSelectorFactory(); if (expressionParams.length >= 1) {
break; return $(expressionValue, expressionParams[0]);
case XPath: } else {
factory = new XPathSelectorFactory(); return $(expressionValue);
break; }
case Regex: case XPath:
factory = new RegexSelectorFactory(); return xpath(expressionValue);
break; case Regex:
case JsonPath: if (expressionParams.length >= 1) {
factory = new JsonPathSelectorFactory(); return regex(expressionValue, Integer.parseInt(expressionParams[0]));
break; } else {
default: return regex(expressionValue);
factory = new XPathSelectorFactory(); // Default to XPath }
case JsonPath:
return new JsonPathSelector(expressionValue);
default:
return xpath(expressionValue);
} }
SelectorCompiler selectorCompiler = new SelectorCompiler(factory);
Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams);
return compiledSelector;
} }
public void setSelector(Selector selector) { public void setSelector(Selector selector) {

View File

@ -1,57 +0,0 @@
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.selector.Selector;
import static us.codecraft.webmagic.selector.Selectors.*;
/**
 * Strategy for turning an expression and its optional parameters into a {@link Selector}.
 */
public interface SelectorFactory {

    /**
     * Builds a selector for the given expression.
     *
     * @param expressionValue  the selector expression
     * @param expressionParams extra parameters; how they are used is up to the implementation
     * @return the compiled selector
     */
    Selector compileSelector(
            String expressionValue,
            String[] expressionParams);
}
/**
 * Builds CSS selectors; the first parameter, when present, is passed as the
 * second argument of {@code $(String, String)}.
 */
class CssSelectorFactory implements SelectorFactory {

    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        return expressionParams.length >= 1
                ? $(expressionValue, expressionParams[0])
                : $(expressionValue);
    }
}
/**
 * Builds XPath selectors from the expression alone; the parameters are not used.
 */
class XPathSelectorFactory implements SelectorFactory {

    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        final Selector compiled = xpath(expressionValue);
        return compiled;
    }
}
/**
 * Builds regex selectors; the first parameter, when present, is parsed as an
 * integer and passed as the second argument of {@code regex(String, int)}.
 */
class RegexSelectorFactory implements SelectorFactory {

    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        if (expressionParams.length < 1) {
            return regex(expressionValue);
        }
        final int group = Integer.parseInt(expressionParams[0]);
        return regex(expressionValue, group);
    }
}
/**
 * Builds {@link JsonPathSelector} instances from the expression alone; the
 * parameters are not used.
 */
class JsonPathSelectorFactory implements SelectorFactory {

    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        final Selector compiled = new JsonPathSelector(expressionValue);
        return compiled;
    }
}
/**
 * Thin wrapper that delegates selector compilation to a configured {@link SelectorFactory}.
 */
class SelectorCompiler {

    /** Factory that performs the actual compilation. */
    private final SelectorFactory selectorFactory;

    /**
     * @param selectorFactory factory to delegate to
     */
    public SelectorCompiler(SelectorFactory selectorFactory) {
        this.selectorFactory = selectorFactory;
    }

    /**
     * Compiles a selector by forwarding both arguments to the factory.
     *
     * @param expressionValue  the selector expression
     * @param expressionParams extra parameters forwarded unchanged
     * @return the selector produced by the factory
     */
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        final Selector compiled = selectorFactory.compileSelector(expressionValue, expressionParams);
        return compiled;
    }
}