From c5a037a8072575b0938bfc26b0e326931f7a6b16 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 22 Jul 2021 13:02:46 +0800 Subject: [PATCH 01/16] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 51e6fdb..cda7ad1 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ec718a1..049477c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 16ed1b4..e6e6068 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 85d5c63..741b081 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index dda1821..c5582c0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 119e50f..d4d3efa 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 1aca5b3..fe4ef68 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 42a6da9..be36376 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.5 + 0.7.6-SNAPSHOT 4.0.0 From ab5d81a6b6ab215e3450cb2fde94df12c5e49544 Mon Sep 17 00:00:00 2001 From: "carl.don:tjr" Date: Wed, 4 Aug 2021 17:17:22 +0800 Subject: [PATCH 02/16] perfect Spider.run to avoid some rare concurrent issue, change the Spider.emptySleepTime to long type --- .../java/us/codecraft/webmagic/Spider.java | 89 +++++++++++++------ 1 file changed, 60 insertions(+), 29 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5940e73..65c0cee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -106,7 +106,7 @@ public class Spider implements Runnable, Task { private Date startTime; - private int emptySleepTime = 30000; + private long emptySleepTime = 30000; /** * create a spider with pageProcessor. @@ -305,32 +305,52 @@ public class Spider implements Runnable, Task { public void run() { checkRunningStat(); initComponent(); - logger.info("Spider {} started!",getUUID()); + logger.info("Spider {} started!", getUUID()); + // interrupt won't be necessarily detected while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { - final Request request = scheduler.poll(this); - if (request == null) { - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - break; - } - // wait until new url added - waitNewUrl(); - } else { - threadPool.execute(new Runnable() { - @Override - public void run() { - try { - processRequest(request); - onSuccess(request); - } catch (Exception e) { - onError(request, e); - logger.error("process request " + request + " error", e); - } finally { - pageCount.incrementAndGet(); - signalNewUrl(); + Request poll = scheduler.poll(this); + if (poll == null) { + if (threadPool.getThreadAlive() == 0) { + //no alive thread anymore , try again + poll = scheduler.poll(this); + if(poll==null) { + if (exitWhenComplete) { + break; + }else{ + // wait + try { + Thread.sleep(emptySleepTime); + continue; + } catch (InterruptedException e) { + break; + } } } - }); + }else { + // wait until new url added, + if(waitNewUrl()) + //if interrupted + break; + continue; + } } + final Request request = poll; + //this may swallow the interruption + threadPool.execute(new Runnable() { + @Override + public void run() { + try { + processRequest(request); + onSuccess(request); + } catch (Exception e) { + onError(request,e); + logger.error("process request " + request + " error", e); + } finally { + pageCount.incrementAndGet(); + signalNewUrl(); + } + } + }); } stat.set(STAT_STOPPED); // release some resources @@ -565,16 +585,24 @@ public class Spider implements Runnable, Task { return this; } - private void waitNewUrl() { + /** + * + * @return isInterrupted + */ + private boolean waitNewUrl() { + // now there may not be any thread live newUrlLock.lock(); try { - //double check - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - return; + //double check,unnecessary, unless very fast concurrent + if (threadPool.getThreadAlive() == 0) { + return false; } + //wait for amount of time newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; } catch (InterruptedException e) { - logger.warn("waitNewUrl - interrupted, error {}", e); + // logger.warn("waitNewUrl - interrupted, error {}", e); + return true; } finally { newUrlLock.unlock(); } @@ -772,7 +800,10 @@ public class Spider implements Runnable, Task { * * @param emptySleepTime In MILLISECONDS. */ - public void setEmptySleepTime(int emptySleepTime) { + public void setEmptySleepTime(long emptySleepTime) { + if(emptySleepTime<=0){ + throw new IllegalArgumentException("emptySleepTime should be more than zero!"); + } this.emptySleepTime = emptySleepTime; } } From fcdb9074d69543b81fd350075d182ce1eeaf26ac Mon Sep 17 00:00:00 2001 From: "carl.don:tjr" Date: Wed, 4 Aug 2021 18:23:04 +0800 Subject: [PATCH 03/16] =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96=20Spider.run?= =?UTF-8?q?=20=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/Spider.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 65c0cee..bc8bb94 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -313,10 +313,10 @@ public class Spider implements Runnable, Task { if (threadPool.getThreadAlive() == 0) { //no alive thread anymore , try again poll = scheduler.poll(this); - if(poll==null) { + if (poll == null) { if (exitWhenComplete) { break; - }else{ + } else { // wait try { Thread.sleep(emptySleepTime); @@ -326,9 +326,9 @@ public class Spider implements Runnable, Task { } } } - }else { + } else { // wait until new url added, - if(waitNewUrl()) + if (waitNewUrl()) //if interrupted break; continue; @@ -343,7 +343,7 @@ public class Spider implements Runnable, Task { processRequest(request); onSuccess(request); } catch (Exception e) { - onError(request,e); + onError(request, e); logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); From 34da2fb3a02708b562ec747679ef0cd8d171a042 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 24 Oct 2021 23:20:38 +0800 Subject: [PATCH 04/16] Make PageProcessor#getSite be default method. Closes #1040. --- .../webmagic/processor/PageProcessor.java | 26 +++++++----- .../webmagic/processor/PageProcessorTest.java | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 1fb125c..3d79b96 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,13 +4,16 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; /** - * Interface to be implemented to customize a crawler.
- *
+ * Interface to be implemented to customize a crawler. + * + *

* In PageProcessor, you can customize: - *
- * start urls and other settings in {@link Site}
- * how the urls to fetch are detected
- * how the data are extracted and stored
+ *

+ *
    + *
  • start URLs and other settings in {@link Site}
  • + *
  • how the URLs to fetch are detected
  • + *
  • how the data are extracted and stored
  • + *
* * @author code4crafter@gmail.com
* @see Site @@ -20,17 +23,20 @@ import us.codecraft.webmagic.Site; public interface PageProcessor { /** - * process the page, extract urls to fetch, extract the data and store + * Processes the page, extract URLs to fetch, extract the data and store. * * @param page page */ - public void process(Page page); + void process(Page page); /** - * get the site settings + * Returns the site settings. * * @return site * @see Site */ - public Site getSite(); + default Site getSite() { + return Site.me(); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java new file mode 100644 index 0000000..ebb1225 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.processor; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; + +public class PageProcessorTest { + + @Test + public void testGetSite() { + Site actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + }.getSite(); + + assertEquals(Site.me(), actualSite); + + actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + @Override + public Site getSite() { + return Site.me().setTimeOut(123); + }; + + }.getSite(); + + assertEquals(Site.me().setTimeOut(123), actualSite); + } + +} From 54da7af17eaffeb54360b4ed81639d84bd064281 Mon Sep 17 00:00:00 2001 From: David Hsing Date: Tue, 3 May 2022 17:42:42 +0800 Subject: [PATCH 05/16] change dependency versions into properties change dependency versions into properties update commons-collections from 3.x to 4.4 --- pom.xml | 78 ++++++++++++------- webmagic-core/pom.xml | 4 +- .../java/us/codecraft/webmagic/Spider.java | 26 ++++--- .../webmagic/selector/AbstractSelectable.java | 2 +- .../webmagic/selector/CssSelector.java | 8 +- .../webmagic/selector/JsonPathSelector.java | 20 +++-- .../webmagic/selector/XpathSelector.java | 6 +- .../downloader/HttpClientDownloaderTest.java | 31 +++++--- .../downloader/MockGithubDownloader.java | 8 +- .../codecraft/webmagic/model/PageMocker.java | 8 +- .../webmagic/samples/AngularJSProcessor.java | 6 +- .../samples/InfoQMiniBookProcessor.java | 2 +- .../webmagic/scripts/ScriptProcessor.java | 19 ++--- .../scripts/ScriptProcessorBuilder.java | 8 +- 14 files changed, 139 insertions(+), 87 deletions(-) diff --git a/pom.xml b/pom.xml index cda7ad1..3774b4b 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,31 @@ UTF-8 1.8 1.8 + 3.18.1 + 1.4 + 4.4 + 2.11.0 + 3.12.0 + 1.2.75 + 3.0.10 + 31.1-jre + 2.26 + 4.5.13 + 4.4.14 + 3.7.1 + 9.2.14.0 + 2.6.0 + 4.13.2 + 2.7.2 + 1.2.17 + 1.10.19 + 1.1.0 + 1.2.0 + 10.3 + 3.141.59 + 1.7.36 4.0.0.RELEASE + 0.3.2
webmagic-parent webmagic-parent @@ -58,59 +82,59 @@ junit junit - 4.13.1 + ${junit.version} test org.mockito mockito-all - 1.10.19 + ${mockito-all.version} test org.apache.httpcomponents httpclient - 4.5.13 + ${httpclient.version} org.apache.httpcomponents httpcore - 4.4.14 + ${httpcore.version} com.google.guava guava - 30.1-jre + ${guava.version} com.jayway.jsonpath json-path - 2.5.0 + ${json-path.version} org.slf4j slf4j-api - 1.7.30 + ${slf4j.version} org.slf4j slf4j-log4j12 - 1.7.30 + ${slf4j.version} us.codecraft xsoup - 0.3.2 + ${xsoup.version} com.alibaba fastjson - 1.2.75 + ${fastjson.version} com.github.dreamhead moco-core - 1.1.0 + ${moco.version} test @@ -122,73 +146,73 @@ log4j log4j - 1.2.17 + ${log4j.version} org.assertj assertj-core - 3.18.1 + ${assertj.version} test org.apache.commons commons-lang3 - 3.11 + ${commons-lang3.version} - commons-collections - commons-collections - 3.2.2 + org.apache.commons + commons-collections4 + ${commons-collections4.version} commons-io commons-io - 2.8.0 + ${commons-io.version} org.codehaus.groovy groovy-all - 3.0.7 + ${groovy-all.version} org.jruby jruby - 9.2.14.0 + ${jruby.version} org.python jython - 2.7.2 + ${jython.version} org.seleniumhq.selenium selenium-java - 3.141.59 + ${selenium-java.version} net.sf.saxon Saxon-HE - 10.3 + ${saxon-he.version} net.sourceforge.htmlcleaner htmlcleaner - 2.9 + ${htmlcleaner.version} com.github.detro phantomjsdriver - 1.2.0 + ${phantomjsdriver.version} commons-cli commons-cli - 1.4 + ${commons-cli.version} redis.clients jedis - 3.6.0 + ${jedis.version} diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 049477c..64b8013 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -52,8 +52,8 @@ - commons-collections - commons-collections + org.apache.commons + commons-collections4 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index bc8bb94..00091c9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,6 +1,20 @@ package us.codecraft.webmagic; -import org.apache.commons.collections.CollectionUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,16 +31,6 @@ import us.codecraft.webmagic.thread.CountableThreadPool; import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; -import java.io.Closeable; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; - /** * Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index e2bb552..8775af1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; import java.util.ArrayList; import java.util.List; +import org.apache.commons.collections4.CollectionUtils; /** * @author code4crafer@gmail.com diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 6a638db..cfe5547 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; -import java.util.ArrayList; -import java.util.List; - /** * CSS selector. Based on Jsoup. * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index f5c0bae..aa9a903 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,11 +1,11 @@ package us.codecraft.webmagic.selector; -import com.alibaba.fastjson.JSON; -import com.jayway.jsonpath.JsonPath; import java.util.ArrayList; import java.util.List; import java.util.Map; +import com.alibaba.fastjson.JSON; +import com.jayway.jsonpath.JsonPath; /** * JsonPath selector.
@@ -16,15 +16,20 @@ import java.util.Map; */ public class JsonPathSelector implements Selector { - private String jsonPathStr; + private final String jsonPathStr; - private JsonPath jsonPath; + private final JsonPath jsonPath; public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; this.jsonPath = JsonPath.compile(this.jsonPathStr); } + @SuppressWarnings("unused") + public String getJsonPathStr() { + return jsonPathStr; + } + @Override public String select(String text) { Object object = jsonPath.read(text); @@ -32,8 +37,8 @@ public class JsonPathSelector implements Selector { return null; } if (object instanceof List) { - List list = (List) object; - if (list != null && list.size() > 0) { + List list = (List) object; + if (list.size() > 0) { return toString(list.iterator().next()); } } @@ -49,8 +54,9 @@ public class JsonPathSelector implements Selector { } @Override + @SuppressWarnings("unchecked") public List selectList(String text) { - List list = new ArrayList(); + List list = new ArrayList<>(); Object object = jsonPath.read(text); if (object == null) { return list; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 8a980a5..4fa1469 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; -import java.util.List; - /** * XPath selector based on Xsoup.
* diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ece0600..780ca75 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,9 +1,10 @@ package us.codecraft.webmagic.downloader; -import com.github.dreamhead.moco.HttpServer; -import com.github.dreamhead.moco.Runnable; -import com.github.dreamhead.moco.Runner; -import org.apache.commons.collections.map.HashedMap; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Map; +import org.apache.commons.collections4.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; @@ -11,6 +12,9 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.junit.Test; +import com.github.dreamhead.moco.HttpServer; +import com.github.dreamhead.moco.Runnable; +import com.github.dreamhead.moco.Runner; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -21,12 +25,19 @@ import us.codecraft.webmagic.proxy.SimpleProxyProvider; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.Map; - -import static com.github.dreamhead.moco.Moco.*; +import static com.github.dreamhead.moco.Moco.and; +import static com.github.dreamhead.moco.Moco.by; +import static com.github.dreamhead.moco.Moco.cookie; +import static com.github.dreamhead.moco.Moco.eq; +import static com.github.dreamhead.moco.Moco.form; +import static com.github.dreamhead.moco.Moco.header; +import static com.github.dreamhead.moco.Moco.httpServer; +import static com.github.dreamhead.moco.Moco.method; +import static com.github.dreamhead.moco.Moco.not; +import static com.github.dreamhead.moco.Moco.query; +import static com.github.dreamhead.moco.Moco.text; +import static com.github.dreamhead.moco.Moco.uri; +import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c..58dd3a6 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -1,13 +1,15 @@ package us.codecraft.webmagic.downloader; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; -import java.io.InputStream; /** * @author code4crafter@gmail.com @@ -19,7 +21,7 @@ public class MockGithubDownloader implements Downloader { Page page = new Page(); InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); try { - page.setRawText(IOUtils.toString(resourceAsStream)); + page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset())); } catch (IOException e) { e.printStackTrace(); } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java index 4b0c133..0451edc 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java @@ -1,11 +1,13 @@ package us.codecraft.webmagic.model; + +import java.io.IOException; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; /** * @author code4crafter@gmail.com @@ -16,7 +18,7 @@ public class PageMocker { public Page getMockJsonPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset())); page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); return page; @@ -24,7 +26,7 @@ public class PageMocker { public Page getMockPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset())); page.setRequest(new Request("http://webmagic.io/list/0")); page.setUrl(new PlainText("http://webmagic.io/list/0")); return page; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index ab560e4..46476bb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.JsonPathSelector; -import java.util.List; - /** * @author code4crafter@gmail.com * @since 0.5.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 280f8f1..33dd6aa 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java index 1822318..78c9d87 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java @@ -1,5 +1,14 @@ package us.codecraft.webmagic.scripts; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Iterator; +import java.util.Map; +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptException; import org.apache.commons.io.IOUtils; import org.jruby.RubyHash; import org.python.core.PyDictionary; @@ -7,14 +16,6 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; -import javax.script.ScriptContext; -import javax.script.ScriptEngine; -import javax.script.ScriptException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.Map; - /** * @author code4crafter@gmail.com * @since 0.4.1 @@ -39,7 +40,7 @@ public class ScriptProcessor implements PageProcessor { enginePool = new ScriptEnginePool(language, threadNum); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(language.getDefineFile()); try { - defines = IOUtils.toString(resourceAsStream); + defines = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { throw new IllegalArgumentException(e); } diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java index 76b3e86..4691528 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.scripts; -import org.apache.commons.io.IOUtils; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; +import org.apache.commons.io.IOUtils; + /** * @author code4crafter@gmail.com @@ -35,7 +37,7 @@ public class ScriptProcessorBuilder { public ScriptProcessorBuilder scriptFromFile(String fileName) { try { InputStream resourceAsStream = new FileInputStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); @@ -46,7 +48,7 @@ public class ScriptProcessorBuilder { public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) { try { InputStream resourceAsStream = ScriptProcessor.class.getClassLoader().getResourceAsStream(fileName); - this.script = IOUtils.toString(resourceAsStream); + this.script = IOUtils.toString(resourceAsStream, Charset.defaultCharset()); } catch (IOException e) { //wrap IOException because I prefer a runtime exception... throw new IllegalArgumentException(e); From 16221e391d58b624fb777ff9725f15d83eabbc6b Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 5 Aug 2022 01:03:00 +0800 Subject: [PATCH 06/16] Fix xhtml namespace. --- .../java/us/codecraft/webmagic/selector/Xpath2Selector.java | 1 + .../java/us/codecraft/webmagic/selector/XpathSelectorTest.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 1f1f0a5..9d5eef9 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -75,6 +75,7 @@ public class Xpath2Selector implements Selector { private XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); + put("xhtml", NamespaceConstant.XHTML); } @Override diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 32906b5..1661883 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1376,7 +1376,7 @@ public class XpathSelectorTest { @Test public void testXpath2Selector() { - Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); + Xpath2Selector xpath2Selector = new Xpath2Selector("//xhtml:a/@href"); String select = xpath2Selector.select(html); Assert.assertEquals("http://www.oschina.net/", select); From d01f26333bb75561e80596932397512f83b177d4 Mon Sep 17 00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 00:21:17 +0800 Subject: [PATCH 07/16] Common the downloader status process and pass error information when onError --- .../downloader/AbstractDownloader.java | 4 +- .../downloader/HttpClientDownloader.java | 4 +- .../downloader/PhantomJSDownloader.java | 123 +++++------ .../selenium/SeleniumDownloader.java | 196 +++++++++--------- 4 files changed, 156 insertions(+), 171 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d..2f9b112 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable e) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e1..89b6038 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb..88b8237 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,73 +16,70 @@ import java.io.*; * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *

+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     * 
+     *
      *   var system = require('system');
      *   var url = system.args[1];
-     *   
+     *
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *   
+     *
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *   
+     *
      *       page.close();
      *       phantom.exit();
      *   });
-     *   
+     *
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - * + *

* example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - String content = getPage(request); - if (content.contains("HTTP request failed")) { - for (int i = 1; i <= getRetryNum(); i++) { - content = getPage(request); - if (!content.contains("HTTP request failed")) { - break; - } - } - if (content.contains("HTTP request failed")) { - //when failed - Page page = new Page(); - page.setRequest(request); - return page; - } - } - Page page = new Page(); - page.setRawText(content); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - page.setStatusCode(200); + Page page = Page.fail(); + try { + String content = getPage(request); + if (!content.contains("HTTP request failed")) { + page.setDownloadSuccess(true); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + page.setStatusCode(200); + } + onSuccess(request); + } catch (Exception e) { + onError(request, e); + logger.warn("download page {} error", request.getUrl(), e); + } return page; } @Override public void setThread(int threadNum) { - this.threadNum = threadNum; + // ignore } - protected String getPage(Request request) { - try { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuffer stringBuffer = new StringBuffer(); - String line; - while ((line = br.readLine()) != null) { - stringBuffer.append(line).append("\n"); - } - return stringBuffer.toString(); - } catch (IOException e) { - e.printStackTrace(); + protected String getPage(Request request) throws Exception { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuilder builder = new StringBuilder(); + String line; + while ((line = br.readLine()) != null) { + builder.append(line).append("\n"); } - - return null; - } - - public int getRetryNum() { - return retryNum; - } - - public PhantomJSDownloader setRetryNum(int retryNum) { - this.retryNum = retryNum; - return this; + return builder.toString(); } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index cce293f..df601b4 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -24,112 +24,120 @@ import java.util.Map; * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader implements Downloader, Closeable { +public class SeleniumDownloader extends AbstractDownloader implements Closeable { - private volatile WebDriverPool webDriverPool; + private volatile WebDriverPool webDriverPool; - private Logger logger = LoggerFactory.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); - private int sleepTime = 0; + private int sleepTime = 0; - private int poolSize = 1; + private int poolSize = 1; - private static final String DRIVER_PHANTOMJS = "phantomjs"; + private static final String DRIVER_PHANTOMJS = "phantomjs"; - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } + /** + * Constructor without any filed. Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver; - try { - webDriver = webDriverPool.get(); - } catch (InterruptedException e) { - logger.warn("interrupted", e); - return null; - } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - Thread.sleep(sleepTime); - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver = null; + Page page = Page.fail(); + try { + webDriver = webDriverPool.get(); - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + if (sleepTime > 0) { + Thread.sleep(sleepTime); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - Page page = new Page(); - page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - webDriverPool.returnToPool(webDriver); - return page; - } + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + page.setDownloadSuccess(true); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + onSuccess(request); + } catch (Exception e) { + logger.warn("download page {} error", request.getUrl(), e); + onError(request, e); + } finally { + if (webDriver != null) { + webDriverPool.returnToPool(webDriver); + } + } + return page; + } - @Override - public void setThread(int thread) { - this.poolSize = thread; - } + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } From acfbd7b883436f2088ead0e5db95bcc1445769a5 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 1 Oct 2022 10:37:09 +0800 Subject: [PATCH 08/16] =?UTF-8?q?Revert=20"Common=20the=20downloader=20sta?= =?UTF-8?q?tus=20process=20and=20pass=20error=20information=20when=20?= =?UTF-8?q?=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../downloader/AbstractDownloader.java | 4 +- .../downloader/HttpClientDownloader.java | 4 +- .../downloader/PhantomJSDownloader.java | 121 ++++++----- .../selenium/SeleniumDownloader.java | 196 +++++++++--------- 4 files changed, 170 insertions(+), 155 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index 2f9b112..c27292d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request, Throwable e) { + protected void onError(Request request) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 89b6038..49217e1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); + onError(request); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()) { + if (!request.isBinaryContent()){ if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 88b8237..6055bdb 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,70 +16,73 @@ import java.io.*; * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + + private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default + private int retryNum; + private int threadNum; + public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - *

- * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + * + * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     *
+     * 
      *   var system = require('system');
      *   var url = system.args[1];
-     *
+     *   
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *
+     *   
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *
+     *   
      *       page.close();
      *       phantom.exit();
      *   });
-     *
+     *   
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - *

+ * * example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() - + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -87,41 +90,61 @@ public class PhantomJSDownloader extends AbstractDownloader { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - - Page page = Page.fail(); - try { - String content = getPage(request); - if (!content.contains("HTTP request failed")) { - page.setDownloadSuccess(true); - page.setRawText(content); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - page.setStatusCode(200); + String content = getPage(request); + if (content.contains("HTTP request failed")) { + for (int i = 1; i <= getRetryNum(); i++) { + content = getPage(request); + if (!content.contains("HTTP request failed")) { + break; + } + } + if (content.contains("HTTP request failed")) { + //when failed + Page page = new Page(); + page.setRequest(request); + return page; } - onSuccess(request); - } catch (Exception e) { - onError(request, e); - logger.warn("download page {} error", request.getUrl(), e); } + + Page page = new Page(); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + page.setStatusCode(200); return page; } @Override public void setThread(int threadNum) { - // ignore + this.threadNum = threadNum; } - protected String getPage(Request request) throws Exception { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuilder builder = new StringBuilder(); - String line; - while ((line = br.readLine()) != null) { - builder.append(line).append("\n"); + protected String getPage(Request request) { + try { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuffer stringBuffer = new StringBuffer(); + String line; + while ((line = br.readLine()) != null) { + stringBuffer.append(line).append("\n"); + } + return stringBuffer.toString(); + } catch (IOException e) { + e.printStackTrace(); } - return builder.toString(); + + return null; + } + + public int getRetryNum() { + return retryNum; + } + + public PhantomJSDownloader setRetryNum(int retryNum) { + this.retryNum = retryNum; + return this; } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index df601b4..cce293f 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.downloader.AbstractDownloader; +import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -24,120 +24,112 @@ import java.util.Map; * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader extends AbstractDownloader implements Closeable { +public class SeleniumDownloader implements Downloader, Closeable { - private volatile WebDriverPool webDriverPool; + private volatile WebDriverPool webDriverPool; - private Logger logger = LoggerFactory.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); - private int sleepTime = 0; + private int sleepTime = 0; - private int poolSize = 1; + private int poolSize = 1; - private static final String DRIVER_PHANTOMJS = "phantomjs"; + private static final String DRIVER_PHANTOMJS = "phantomjs"; - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } + /** + * Constructor without any filed. Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver = null; - Page page = Page.fail(); - try { - webDriver = webDriverPool.get(); + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver; + try { + webDriver = webDriverPool.get(); + } catch (InterruptedException e) { + logger.warn("interrupted", e); + return null; + } + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - if (sleepTime > 0) { - Thread.sleep(sleepTime); - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + Page page = new Page(); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + webDriverPool.returnToPool(webDriver); + return page; + } - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - page.setDownloadSuccess(true); - page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - onSuccess(request); - } catch (Exception e) { - logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); - } finally { - if (webDriver != null) { - webDriverPool.returnToPool(webDriver); - } - } - return page; - } + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } + @Override + public void setThread(int thread) { + this.poolSize = thread; + } - @Override - public void setThread(int thread) { - this.poolSize = thread; - } - - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } From 7a62a6cb45b02466bc343ad7c7d1984e6f831594 Mon Sep 17 00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 17:33:11 +0800 Subject: [PATCH 09/16] =?UTF-8?q?Revert=20"Revert=20"Common=20the=20downlo?= =?UTF-8?q?ader=20status=20process=20and=20pass=20error=20information=20wh?= =?UTF-8?q?en=20=E2=80=A6""?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit acfbd7b883436f2088ead0e5db95bcc1445769a5. --- .../downloader/AbstractDownloader.java | 4 +- .../downloader/HttpClientDownloader.java | 4 +- .../downloader/PhantomJSDownloader.java | 123 +++++------ .../selenium/SeleniumDownloader.java | 196 +++++++++--------- 4 files changed, 156 insertions(+), 171 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d..2f9b112 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable e) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e1..89b6038 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb..88b8237 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,73 +16,70 @@ import java.io.*; * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *

+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     * 
+     *
      *   var system = require('system');
      *   var url = system.args[1];
-     *   
+     *
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *   
+     *
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *   
+     *
      *       page.close();
      *       phantom.exit();
      *   });
-     *   
+     *
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - * + *

* example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - String content = getPage(request); - if (content.contains("HTTP request failed")) { - for (int i = 1; i <= getRetryNum(); i++) { - content = getPage(request); - if (!content.contains("HTTP request failed")) { - break; - } - } - if (content.contains("HTTP request failed")) { - //when failed - Page page = new Page(); - page.setRequest(request); - return page; - } - } - Page page = new Page(); - page.setRawText(content); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - page.setStatusCode(200); + Page page = Page.fail(); + try { + String content = getPage(request); + if (!content.contains("HTTP request failed")) { + page.setDownloadSuccess(true); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + page.setStatusCode(200); + } + onSuccess(request); + } catch (Exception e) { + onError(request, e); + logger.warn("download page {} error", request.getUrl(), e); + } return page; } @Override public void setThread(int threadNum) { - this.threadNum = threadNum; + // ignore } - protected String getPage(Request request) { - try { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuffer stringBuffer = new StringBuffer(); - String line; - while ((line = br.readLine()) != null) { - stringBuffer.append(line).append("\n"); - } - return stringBuffer.toString(); - } catch (IOException e) { - e.printStackTrace(); + protected String getPage(Request request) throws Exception { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuilder builder = new StringBuilder(); + String line; + while ((line = br.readLine()) != null) { + builder.append(line).append("\n"); } - - return null; - } - - public int getRetryNum() { - return retryNum; - } - - public PhantomJSDownloader setRetryNum(int retryNum) { - this.retryNum = retryNum; - return this; + return builder.toString(); } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index cce293f..df601b4 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -24,112 +24,120 @@ import java.util.Map; * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader implements Downloader, Closeable { +public class SeleniumDownloader extends AbstractDownloader implements Closeable { - private volatile WebDriverPool webDriverPool; + private volatile WebDriverPool webDriverPool; - private Logger logger = LoggerFactory.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); - private int sleepTime = 0; + private int sleepTime = 0; - private int poolSize = 1; + private int poolSize = 1; - private static final String DRIVER_PHANTOMJS = "phantomjs"; + private static final String DRIVER_PHANTOMJS = "phantomjs"; - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } + /** + * Constructor without any filed. Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver; - try { - webDriver = webDriverPool.get(); - } catch (InterruptedException e) { - logger.warn("interrupted", e); - return null; - } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - Thread.sleep(sleepTime); - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver = null; + Page page = Page.fail(); + try { + webDriver = webDriverPool.get(); - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + if (sleepTime > 0) { + Thread.sleep(sleepTime); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - Page page = new Page(); - page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - webDriverPool.returnToPool(webDriver); - return page; - } + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + page.setDownloadSuccess(true); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + onSuccess(request); + } catch (Exception e) { + logger.warn("download page {} error", request.getUrl(), e); + onError(request, e); + } finally { + if (webDriver != null) { + webDriverPool.returnToPool(webDriver); + } + } + return page; + } - @Override - public void setThread(int thread) { - this.poolSize = thread; - } + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } From 5751681c9126e3c9ea1daeece2dc3eba10a281e7 Mon Sep 17 00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 17:34:02 +0800 Subject: [PATCH 10/16] Common the downloader status process and pass error information when onError --- .../us/codecraft/webmagic/samples/PhantomJSPageProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java index 99d5fa8..ab53140 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java @@ -36,7 +36,7 @@ public class PhantomJSPageProcessor implements PageProcessor { } public static void main(String[] args) throws Exception { - PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3); + PhantomJSDownloader phantomDownloader = new PhantomJSDownloader(); CollectorPipeline collectorPipeline = new ResultItemsCollectorPipeline(); From e7a7fbeeeb6ebc1e8f2bc152d1b142f4e6590a10 Mon Sep 17 00:00:00 2001 From: "vio.ao" Date: Sat, 1 Oct 2022 18:23:15 +0800 Subject: [PATCH 11/16] Enhance Jsoup could parse tr td tag directly --- .../selector/BaseElementSelector.java | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index bbc7217..b267d5b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.util.ArrayList; @@ -11,11 +12,24 @@ import java.util.List; * @since 0.3.0 */ public abstract class BaseElementSelector implements Selector, ElementSelector { + private Document parse(String text) { + if (text == null) { + return null; + } + + // Jsoup could not parse or tag directly + // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + if ((text.startsWith("") && text.endsWith("")) + || (text.startsWith("") && text.endsWith(""))) { + text = "" + text + "
"; + } + return Jsoup.parse(text); + } @Override public String select(String text) { if (text != null) { - return select(Jsoup.parse(text)); + return select(parse(text)); } return null; } @@ -23,7 +37,7 @@ public abstract class BaseElementSelector implements Selector, ElementSelector { @Override public List selectList(String text) { if (text != null) { - return selectList(Jsoup.parse(text)); + return selectList(parse(text)); } else { return new ArrayList(); } @@ -31,14 +45,14 @@ public abstract class BaseElementSelector implements Selector, ElementSelector { public Element selectElement(String text) { if (text != null) { - return selectElement(Jsoup.parse(text)); + return selectElement(parse(text)); } return null; } public List selectElements(String text) { if (text != null) { - return selectElements(Jsoup.parse(text)); + return selectElements(parse(text)); } else { return new ArrayList(); } From afc8309409ec495a9dab81fac0ba31ca094c5da7 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 1 Oct 2022 23:34:01 +0800 Subject: [PATCH 12/16] Upgrade maven plugins and dependencies. --- pom.xml | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index 3774b4b..1beac05 100644 --- a/pom.xml +++ b/pom.xml @@ -9,31 +9,31 @@ UTF-8 1.8 1.8 - 3.18.1 - 1.4 + 3.23.1 + 1.5.0 4.4 2.11.0 3.12.0 - 1.2.75 - 3.0.10 + 2.0.14.graal + 3.0.13 31.1-jre 2.26 4.5.13 - 4.4.14 + 4.4.15 3.7.1 - 9.2.14.0 - 2.6.0 + 9.3.8.0 + 2.7.0 4.13.2 - 2.7.2 + 2.7.3 1.2.17 - 1.10.19 - 1.1.0 + 2.0.2-beta + 1.3.0 1.2.0 - 10.3 + 11.4 3.141.59 - 1.7.36 + 2.0.3 4.0.0.RELEASE - 0.3.2 + 0.3.5 webmagic-parent webmagic-parent @@ -222,7 +222,7 @@ org.apache.maven.plugins maven-enforcer-plugin - 3.0.0-M3 + 3.1.0 enforce-maven @@ -296,7 +296,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.2.0 + 3.4.1 UTF-8 WebMagic ${project.version} @@ -325,7 +325,7 @@ org.apache.maven.plugins maven-release-plugin - 3.0.0-M1 + 3.0.0-M6 org.jacoco @@ -360,77 +360,77 @@ org.apache.maven.plugins maven-clean-plugin - 3.1.0 + 3.2.0 org.apache.maven.plugins maven-compiler-plugin - 3.8.1 + 3.10.1 org.apache.maven.plugins maven-deploy-plugin - 3.0.0-M1 + 3.0.0 org.apache.maven.plugins maven-install-plugin - 3.0.0-M1 + 3.0.1 org.apache.maven.plugins maven-jar-plugin - 3.2.0 + 3.3.0 org.apache.maven.plugins maven-jxr-plugin - 3.1.1 + 3.3.0 org.apache.maven.plugins maven-pmd-plugin - 3.14.0 + 3.19.0 org.apache.maven.plugins maven-resources-plugin - 3.2.0 + 3.3.0 org.apache.maven.plugins maven-site-plugin - 3.9.1 + 4.0.0-M3 org.apache.maven.plugins maven-surefire-plugin - 3.0.0-M5 + 3.0.0-M7 org.apache.maven.plugins maven-surefire-report-plugin - 3.0.0-M5 + 3.0.0-M7 org.codehaus.mojo taglist-maven-plugin - 2.4 + 3.0.0 org.jacoco jacoco-maven-plugin - 0.8.7 + 0.8.8 com.amashchenko.maven.plugin gitflow-maven-plugin - 1.15.0 + 1.18.0 com.github.spotbugs spotbugs-maven-plugin - 4.2.3 + 4.7.2.0 @@ -477,7 +477,7 @@ org.apache.maven.plugins maven-source-plugin - 2.2.1 + 3.2.1 package @@ -491,7 +491,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.9.1 + 3.4.1 package @@ -505,7 +505,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.6 + 3.0.1 verify @@ -518,7 +518,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6.8 + 1.6.13 true sonatype-nexus-staging From 126c32ecd0ae68ec5a6eddf48486a210a18a4615 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 16 Oct 2022 07:20:22 +0800 Subject: [PATCH 13/16] Fix compatible issue. --- .../src/main/java/us/codecraft/webmagic/SpiderListener.java | 5 +++-- .../us/codecraft/webmagic/downloader/AbstractDownloader.java | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java index 8f10e0e..b55ef3d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java @@ -8,13 +8,14 @@ package us.codecraft.webmagic; */ public interface SpiderListener { - public void onSuccess(Request request); + void onSuccess(Request request); /** * @deprecated Use {@link #onError(Request, Exception)} instead. */ @Deprecated - public void onError(Request request); + default void onError(Request request) { + } default void onError(Request request, Exception e) { this.onError(request); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index 2f9b112..eb3a3a3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -38,6 +38,11 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } + @Deprecated + protected void onError(Request request) { + this.onError(request, null); + } + protected void onError(Request request, Throwable e) { } From 5f80e02abd7093f66d798c44d46ff55cf75bb4c4 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 19 Oct 2022 22:08:38 +0800 Subject: [PATCH 14/16] Interrupt current thread. --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 00091c9..fd35f77 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -326,6 +326,7 @@ public class Spider implements Runnable, Task { Thread.sleep(emptySleepTime); continue; } catch (InterruptedException e) { + Thread.currentThread().interrupt(); break; } } @@ -493,6 +494,7 @@ public class Spider implements Runnable, Task { Thread.sleep(time); } catch (InterruptedException e) { logger.error("Thread interrupted when sleep",e); + Thread.currentThread().interrupt(); } } From d2b2eed9df619d85d9cb6d808f7ef4bdc0c50d5f Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 19 Oct 2022 22:10:04 +0800 Subject: [PATCH 15/16] Pass the task to onSuccess & onError. --- .../webmagic/downloader/AbstractDownloader.java | 16 ++++++++++++++-- .../downloader/HttpClientDownloader.java | 4 ++-- .../webmagic/downloader/PhantomJSDownloader.java | 4 ++-- .../downloader/selenium/SeleniumDownloader.java | 4 ++-- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index eb3a3a3..ea3bbc5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; /** @@ -35,15 +36,26 @@ public abstract class AbstractDownloader implements Downloader { return (Html) page.getHtml(); } + @Deprecated protected void onSuccess(Request request) { } + /** + * @since 0.7.6 + */ + protected void onSuccess(Request request, Task task) { + this.onSuccess(request); + } + @Deprecated protected void onError(Request request) { - this.onError(request, null); } - protected void onError(Request request, Throwable e) { + /** + * @since 0.7.6 + */ + protected void onError(Request request, Task task, Throwable e) { + this.onError(request); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 89b6038..f138b20 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -82,12 +82,12 @@ public class HttpClientDownloader extends AbstractDownloader { try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); - onSuccess(request); + onSuccess(request, task); logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); + onError(request, task, e); return page; } finally { if (httpResponse != null) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 88b8237..4f1eee8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -98,9 +98,9 @@ public class PhantomJSDownloader extends AbstractDownloader { page.setRequest(request); page.setStatusCode(200); } - onSuccess(request); + onSuccess(request, task); } catch (Exception e) { - onError(request, e); + onError(request, task, e); logger.warn("download page {} error", request.getUrl(), e); } return page; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index df601b4..39b3bc9 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -111,10 +111,10 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - onSuccess(request); + onSuccess(request, task); } catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); + onError(request, task, e); } finally { if (webDriver != null) { webDriverPool.returnToPool(webDriver); From 838c47f1f6a6274f0d18f432c00729957be0e90d Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 23 Oct 2022 23:58:21 +0800 Subject: [PATCH 16/16] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 1beac05..2394459 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 64b8013..fe1ff12 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e6e6068..289d275 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.6-SNAPSHOT + 0.7.6 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 741b081..fc5d9b7 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c5582c0..6b3af83 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index d4d3efa..893fc0b 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index fe4ef68..80b9eef 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index be36376..36ded00 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6-SNAPSHOT + 0.7.6 4.0.0