From 00dfebbcebcf3ebe169b47887ea015d3f16eced4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 18 Dec 2016 10:45:50 +0800 Subject: [PATCH] #424 remove guava dep and add fix docs --- pom.xml | 10 +++---- webmagic-core/pom.xml | 11 ------- .../main/java/us/codecraft/webmagic/Site.java | 17 ++++++----- .../java/us/codecraft/webmagic/Spider.java | 7 ++--- .../downloader/HttpClientDownloader.java | 4 +-- .../component/HashSetDuplicateRemover.java | 4 +-- .../webmagic/selector/Selectors.java | 2 +- .../webmagic/utils/WMCollections.java | 30 +++++++++++++++++++ webmagic-extension/pom.xml | 6 ++++ .../webmagic/configurable/ExpressionType.java | 1 - .../webmagic/configurable/ExtractRule.java | 1 - .../downloader/PhantomJSDownloader.java | 2 +- .../handler/CompositePageProcessor.java | 1 - .../webmagic/handler/SubPageProcessor.java | 1 - .../webmagic/monitor/SpiderMonitor.java | 1 + .../BloomFilterDuplicateRemover.java | 11 +++++-- .../BloomFilterDuplicateRemoverTest.java | 1 - .../webmagic/model/samples/BaiduNews.java | 1 - .../webmagic/model/samples/QQMeishi.java | 1 - .../samples/OschinaBlogPageProcesser.java | 2 +- .../samples/PhantomJSPageProcessor.java | 2 +- .../samples/pipeline/OneFilePipeline.java | 3 -- .../webmagic/scripts/ScriptConsole.java | 8 ++--- 23 files changed, 75 insertions(+), 52 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java rename {webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component => webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler}/BloomFilterDuplicateRemover.java (90%) rename {webmagic-core => webmagic-extension}/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java (97%) diff --git a/pom.xml b/pom.xml index 52becfc..1555550 100644 --- a/pom.xml +++ b/pom.xml @@ -70,16 +70,16 @@ httpclient 4.5.2 - - com.jayway.jsonpath - json-path - 0.8.1 - com.google.guava guava 15.0 + + com.jayway.jsonpath + json-path + 0.8.1 + org.slf4j slf4j-api diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 6dc34e5..9b63a86 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -20,11 +20,6 @@ junit - - com.google.guava - guava - - org.apache.commons commons-lang3 @@ -73,12 +68,6 @@ com.jayway.jsonpath json-path - - - commons-lang - commons-lang - - diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index a111314..ac9f9ce 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,13 +1,10 @@ package us.codecraft.webmagic; -import com.google.common.collect.HashBasedTable; -import com.google.common.collect.Table; import org.apache.http.HttpHost; - -import us.codecraft.webmagic.proxy.Proxy; -import us.codecraft.webmagic.proxy.SimpleProxyPool; import org.apache.http.auth.UsernamePasswordCredentials; +import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyPool; +import us.codecraft.webmagic.proxy.SimpleProxyPool; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -27,7 +24,7 @@ public class Site { private Map defaultCookies = new LinkedHashMap(); - private Table cookies = HashBasedTable.create(); + private Map> cookies = new HashMap>(); private String charset; @@ -104,7 +101,10 @@ public class Site { * @return this */ public Site addCookie(String domain, String name, String value) { - cookies.put(domain, name, value); + if (!cookies.containsKey(domain)){ + cookies.put(domain,new HashMap()); + } + cookies.get(domain).put(name, value); return this; } @@ -134,7 +134,7 @@ public class Site { * @return get cookies */ public Map> getAllCookies() { - return cookies.rowMap(); + return cookies; } /** @@ -483,6 +483,7 @@ public class Site { * Set httpProxyPool, String[0]:ip, String[1]:port
* * @param httpProxyList httpProxyList + * @param isUseLastProxy isUseLastProxy * @return this */ public Site setHttpProxyPool(List httpProxyList, boolean isUseLastProxy) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a2be633..050eec0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,8 +1,6 @@ package us.codecraft.webmagic; -import com.google.common.collect.Lists; import org.apache.commons.collections.CollectionUtils; -import org.apache.http.HttpHost; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; @@ -16,6 +14,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.thread.CountableThreadPool; import us.codecraft.webmagic.utils.UrlUtils; +import us.codecraft.webmagic.utils.WMCollections; import java.io.Closeable; import java.io.IOException; @@ -173,9 +172,9 @@ public class Spider implements Runnable, Task { * * @param scheduler scheduler * @return this - * @Deprecated * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) */ + @Deprecated public Spider scheduler(Scheduler scheduler) { return setScheduler(scheduler); } @@ -499,7 +498,7 @@ public class Spider implements Runnable, Task { } public T get(String url) { - List urls = Lists.newArrayList(url); + List urls = WMCollections.newArrayList(url); List resultItemses = getAll(urls); if (resultItemses != null && resultItemses.size() > 0) { return resultItemses.get(0); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 75d2511..0c4b06d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader; -import com.google.common.collect.Sets; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHost; @@ -28,6 +27,7 @@ import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; +import us.codecraft.webmagic.utils.WMCollections; import java.io.IOException; import java.nio.charset.Charset; @@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader { charset = site.getCharset(); headers = site.getHeaders(); } else { - acceptStatCode = Sets.newHashSet(200); + acceptStatCode = WMCollections.newHashSet(200); } logger.info("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java index 1190762..2c8a6d4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java @@ -1,9 +1,9 @@ package us.codecraft.webmagic.scheduler.component; -import com.google.common.collect.Sets; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import java.util.Collections; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap; */ public class HashSetDuplicateRemover implements DuplicateRemover { - private Set urls = Sets.newSetFromMap(new ConcurrentHashMap()); + private Set urls = Collections.newSetFromMap(new ConcurrentHashMap()); @Override public boolean isDuplicate(Request request, Task task) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index f63841b..7cd68c1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -33,11 +33,11 @@ public abstract class Selectors { } /** - * @Deprecated * @see #xpath(String) * @param expr expr * @return new selector */ + @Deprecated public static XpathSelector xsoup(String expr) { return new XpathSelector(expr); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java new file mode 100644 index 0000000..23e1644 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java @@ -0,0 +1,30 @@ +package us.codecraft.webmagic.utils; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * @author code4crafter@gmail.com + * Date: 16/12/18 + * Time: 上午10:16 + */ +public class WMCollections { + + public static Set newHashSet(T... t){ + Set set = new HashSet(t.length); + for (T t1 : t) { + set.add(t1); + } + return set; + } + + public static List newArrayList(T... t){ + List set = new ArrayList(t.length); + for (T t1 : t) { + set.add(t1); + } + return set; + } +} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 054342f..bb4a815 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -15,6 +15,12 @@ jedis 2.9.0
+ + com.google.guava + guava + 15.0 + true + us.codecraft webmagic-core diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java index bd84be3..d873e65 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.configurable; /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public enum ExpressionType { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java index 82337c4..bbc48dd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -7,7 +7,6 @@ import static us.codecraft.webmagic.selector.Selectors.*; /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public class ExtractRule { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 2292788..88fa7c0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -37,7 +37,7 @@ public class PhantomJSDownloader extends AbstractDownloader { * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException * - * @param phantomJsCommand + * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java index 2073445..b7a39ed 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java @@ -9,7 +9,6 @@ import java.util.List; /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public class CompositePageProcessor implements PageProcessor { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java index 12f62df..f7baad7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page; /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public interface SubPageProcessor extends RequestMatcher { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index cae0d1d..a8aaecf 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -45,6 +45,7 @@ public class SpiderMonitor { * * @param spiders spiders * @return this + * @throws JMException */ public synchronized SpiderMonitor register(Spider... spiders) throws JMException { for (Spider spider : spiders) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java similarity index 90% rename from webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java index 6d5e597..db84302 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java @@ -1,9 +1,16 @@ -package us.codecraft.webmagic.scheduler.component; +package us.codecraft.webmagic.scheduler; + +/** + * @author code4crafter@gmail.com + * Date: 16/12/18 + * Time: 上午10:23 + */ import com.google.common.hash.BloomFilter; import com.google.common.hash.Funnels; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import java.nio.charset.Charset; import java.util.concurrent.atomic.AtomicInteger; @@ -67,4 +74,4 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover { public int getTotalRequestsCount(Task task) { return counter.get(); } -} +} \ No newline at end of file diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java similarity index 97% rename from webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java index f8e0b9a..39c2b6a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.scheduler; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java index 4795662..e83d944 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java @@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy; /** * @author code4crafter@gmail.com - * @date 14-4-9 */ public class BaiduNews { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java index f4f8591..8120e35 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java @@ -8,7 +8,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com - * @date 14-4-11 */ @TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*") @ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true) diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 2e3996c..e6db04e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -5,8 +5,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.monitor.SpiderMonitor; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; import us.codecraft.webmagic.scheduler.QueueScheduler; -import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover; import javax.management.JMException; import java.util.List; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java index b4f1936..99d5fa8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java @@ -13,7 +13,7 @@ import java.util.List; /** * Created by dolphineor on 2014-11-21. - *

+ *

* 以淘宝为例, 搜索冬装的相关结果 */ public class PhantomJSPageProcessor implements PageProcessor { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java index 9cb1bc2..4f38ecb 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java @@ -19,9 +19,6 @@ public class OneFilePipeline extends FilePersistentBase implements Pipeline { private PrintWriter printWriter; - /** - * create a FilePipeline with default path"/data/webmagic/" - */ public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException { this("/data/webmagic/"); } diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 57a923e..0423e58 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scripts; -import com.google.common.collect.Sets; import org.apache.commons.cli.*; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -8,6 +7,7 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.utils.WMCollections; import java.util.HashMap; import java.util.List; @@ -29,8 +29,8 @@ public class ScriptConsole { private static Map> alias = new HashMap>(); static { - alias.put(Language.JavaScript, Sets.newHashSet("js", "javascript", "JavaScript", "JS")); - alias.put(Language.JRuby, Sets.newHashSet("ruby", "jruby", "Ruby", "JRuby")); + alias.put(Language.JavaScript, WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); + alias.put(Language.JRuby, WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); } public void setLanguagefromArg(String arg) { @@ -93,7 +93,7 @@ public class ScriptConsole { .language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build(); pageProcessor.getSite().setSleepTime(params.getSleepTime()); pageProcessor.getSite().setRetryTimes(3); - pageProcessor.getSite().setAcceptStatCode(Sets.newHashSet(200, 404,403, 500,502)); + pageProcessor.getSite().setAcceptStatCode(WMCollections.newHashSet(200, 404,403, 500,502)); Spider spider = Spider.create(pageProcessor).thread(params.getThread()); spider.clearPipeline().addPipeline(new Pipeline() { @Override