#424 remove guava dep and add fix docs
parent
c2531c6817
commit
00dfebbceb
10
pom.xml
10
pom.xml
|
@ -70,16 +70,16 @@
|
|||
<artifactId>httpclient</artifactId>
|
||||
<version>4.5.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<version>0.8.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>15.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<version>0.8.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
|
|
|
@ -20,11 +20,6 @@
|
|||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
|
@ -73,12 +68,6 @@
|
|||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>commons-lang</groupId>
|
||||
<artifactId>commons-lang</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
|
@ -1,13 +1,10 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import com.google.common.collect.HashBasedTable;
|
||||
import com.google.common.collect.Table;
|
||||
import org.apache.http.HttpHost;
|
||||
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.proxy.SimpleProxyPool;
|
||||
import org.apache.http.auth.UsernamePasswordCredentials;
|
||||
import us.codecraft.webmagic.proxy.Proxy;
|
||||
import us.codecraft.webmagic.proxy.ProxyPool;
|
||||
import us.codecraft.webmagic.proxy.SimpleProxyPool;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -27,7 +24,7 @@ public class Site {
|
|||
|
||||
private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
|
||||
|
||||
private Table<String, String, String> cookies = HashBasedTable.create();
|
||||
private Map<String, Map<String, String>> cookies = new HashMap<String, Map<String, String>>();
|
||||
|
||||
private String charset;
|
||||
|
||||
|
@ -104,7 +101,10 @@ public class Site {
|
|||
* @return this
|
||||
*/
|
||||
public Site addCookie(String domain, String name, String value) {
|
||||
cookies.put(domain, name, value);
|
||||
if (!cookies.containsKey(domain)){
|
||||
cookies.put(domain,new HashMap<String, String>());
|
||||
}
|
||||
cookies.get(domain).put(name, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -134,7 +134,7 @@ public class Site {
|
|||
* @return get cookies
|
||||
*/
|
||||
public Map<String,Map<String, String>> getAllCookies() {
|
||||
return cookies.rowMap();
|
||||
return cookies;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -483,6 +483,7 @@ public class Site {
|
|||
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
||||
*
|
||||
* @param httpProxyList httpProxyList
|
||||
* @param isUseLastProxy isUseLastProxy
|
||||
* @return this
|
||||
*/
|
||||
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.http.HttpHost;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.downloader.Downloader;
|
||||
|
@ -16,6 +14,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
|
|||
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||
import us.codecraft.webmagic.thread.CountableThreadPool;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
@ -173,9 +172,9 @@ public class Spider implements Runnable, Task {
|
|||
*
|
||||
* @param scheduler scheduler
|
||||
* @return this
|
||||
* @deprecated use {@link #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)} instead
|
||||
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
|
||||
*/
|
||||
@Deprecated
|
||||
public Spider scheduler(Scheduler scheduler) {
|
||||
return setScheduler(scheduler);
|
||||
}
|
||||
|
@ -499,7 +498,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
|
||||
public <T> T get(String url) {
|
||||
List<String> urls = Lists.newArrayList(url);
|
||||
List<String> urls = WMCollections.newArrayList(url);
|
||||
List<T> resultItemses = getAll(urls);
|
||||
if (resultItemses != null && resultItemses.size() > 0) {
|
||||
return resultItemses.get(0);
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package us.codecraft.webmagic.downloader;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.HttpHost;
|
||||
|
@ -28,6 +27,7 @@ import us.codecraft.webmagic.proxy.Proxy;
|
|||
import us.codecraft.webmagic.selector.PlainText;
|
||||
import us.codecraft.webmagic.utils.HttpConstant;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
|
@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|||
charset = site.getCharset();
|
||||
headers = site.getHeaders();
|
||||
} else {
|
||||
acceptStatCode = Sets.newHashSet(200);
|
||||
acceptStatCode = WMCollections.newHashSet(200);
|
||||
}
|
||||
logger.info("downloading page {}", request.getUrl());
|
||||
CloseableHttpResponse httpResponse = null;
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
|
@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
*/
|
||||
public class HashSetDuplicateRemover implements DuplicateRemover {
|
||||
|
||||
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||
private Set<String> urls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||
|
||||
@Override
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
|
|
|
@ -33,11 +33,11 @@ public abstract class Selectors {
|
|||
}
|
||||
|
||||
/**
|
||||
* @Deprecated
|
||||
* @see #xpath(String)
|
||||
* @param expr expr
|
||||
* @return new selector
|
||||
*/
|
||||
@Deprecated
|
||||
public static XpathSelector xsoup(String expr) {
|
||||
return new XpathSelector(expr);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * Small varargs factory helpers for JDK collections, used in place of the
 * equivalent Guava factories.
 *
 * @author code4crafter@gmail.com
 * Date: 16/12/18
 * Time: 10:16 AM
 */
public class WMCollections {

    /**
     * Creates a new mutable {@link HashSet} holding the given elements.
     * Repeated arguments collapse into a single entry.
     *
     * @param t   elements to put into the set; must not be a {@code null} array
     * @param <T> element type
     * @return a new, modifiable set containing every distinct argument
     */
    public static <T> Set<T> newHashSet(T... t){
        // Size for the default 0.75 load factor so inserting t.length elements never rehashes.
        Set<T> set = new HashSet<T>(Math.max((int) (t.length / 0.75f) + 1, 16));
        for (T element : t) {
            set.add(element);
        }
        return set;
    }

    /**
     * Creates a new mutable {@link ArrayList} holding the given elements in argument order.
     *
     * @param t   elements to put into the list; must not be a {@code null} array
     * @param <T> element type
     * @return a new, modifiable list containing the arguments in order
     */
    public static <T> List<T> newArrayList(T... t){
        List<T> list = new ArrayList<T>(t.length);
        for (T element : t) {
            list.add(element);
        }
        return list;
    }
}
|
|
@ -15,6 +15,12 @@
|
|||
<artifactId>jedis</artifactId>
|
||||
<version>2.9.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>15.0</version>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.configurable;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-5
|
||||
*/
|
||||
public enum ExpressionType {
|
||||
|
||||
|
|
|
@ -7,7 +7,6 @@ import static us.codecraft.webmagic.selector.Selectors.*;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-5
|
||||
*/
|
||||
public class ExtractRule {
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
||||
*
|
||||
* @param phantomJsCommand
|
||||
* @param phantomJsCommand phantomJsCommand
|
||||
*/
|
||||
public PhantomJSDownloader(String phantomJsCommand) {
|
||||
this.initPhantomjsCrawlPath();
|
||||
|
|
|
@ -9,7 +9,6 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-5
|
||||
*/
|
||||
public class CompositePageProcessor implements PageProcessor {
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-5
|
||||
*/
|
||||
public interface SubPageProcessor extends RequestMatcher {
|
||||
|
||||
|
|
|
@ -45,6 +45,7 @@ public class SpiderMonitor {
|
|||
*
|
||||
* @param spiders spiders
|
||||
* @return this
|
||||
* @throws JMException
|
||||
*/
|
||||
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
||||
for (Spider spider : spiders) {
|
||||
|
|
|
@ -1,9 +1,16 @@
|
|||
package us.codecraft.webmagic.scheduler.component;
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* Date: 16/12/18
|
||||
* Time: 上午10:23
|
||||
*/
|
||||
|
||||
import com.google.common.hash.BloomFilter;
|
||||
import com.google.common.hash.Funnels;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
@ -3,7 +3,6 @@ package us.codecraft.webmagic.scheduler;
|
|||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||
|
|
@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-9
|
||||
*/
|
||||
public class BaiduNews {
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
|
|||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-11
|
||||
*/
|
||||
@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
|
||||
@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true)
|
||||
|
|
|
@ -5,8 +5,8 @@ import us.codecraft.webmagic.Site;
|
|||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||
|
||||
import javax.management.JMException;
|
||||
import java.util.List;
|
||||
|
|
|
@ -13,7 +13,7 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* Created by dolphineor on 2014-11-21.
|
||||
* <p/>
|
||||
* <p>
|
||||
* 以淘宝为例, 搜索冬装的相关结果
|
||||
*/
|
||||
public class PhantomJSPageProcessor implements PageProcessor {
|
||||
|
|
|
@ -19,9 +19,6 @@ public class OneFilePipeline extends FilePersistentBase implements Pipeline {
|
|||
|
||||
private PrintWriter printWriter;
|
||||
|
||||
/**
 * Creates a OneFilePipeline using the default path "/data/webmagic/".
 *
 * @throws FileNotFoundException        propagated from the constructor this one delegates to
 * @throws UnsupportedEncodingException propagated from the constructor this one delegates to
 */
public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
    this("/data/webmagic/");
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import org.apache.commons.cli.*;
|
||||
import org.apache.log4j.Level;
|
||||
import org.apache.log4j.Logger;
|
||||
|
@ -8,6 +7,7 @@ import us.codecraft.webmagic.ResultItems;
|
|||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
import us.codecraft.webmagic.utils.WMCollections;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
@ -29,8 +29,8 @@ public class ScriptConsole {
|
|||
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
|
||||
|
||||
static {
    // Register the alias strings for each script language exactly once.
    alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
    alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
}
|
||||
|
||||
public void setLanguagefromArg(String arg) {
|
||||
|
@ -93,7 +93,7 @@ public class ScriptConsole {
|
|||
.language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
|
||||
pageProcessor.getSite().setSleepTime(params.getSleepTime());
|
||||
pageProcessor.getSite().setRetryTimes(3);
|
||||
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404,403, 500,502));
|
||||
pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));
|
||||
Spider spider = Spider.create(pageProcessor).thread(params.getThread());
|
||||
spider.clearPipeline().addPipeline(new Pipeline() {
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue