diff --git a/README.md b/README.md index cee747c..b23bf83 100644 --- a/README.md +++ b/README.md @@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.5.0 + 0.5.1 us.codecraft webmagic-extension - 0.5.0 + 0.5.1 ``` @@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) * [lidongyang](http://my.oschina.net/lidongyang) * [seveniu](https://github.com/seveniu) * [sebastian1118](https://github.com/sebastian1118) +* [codev777](https://github.com/codev777) ### 邮件组: diff --git a/en_docs/README.md b/en_docs/README.md index cc63925..5ae494c 100644 --- a/en_docs/README.md +++ b/en_docs/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.5.0 + 0.5.1 us.codecraft webmagic-extension - 0.5.0 + 0.5.1 ``` @@ -145,6 +145,7 @@ Thanks these people for commiting source code, reporting bugs or suggesting for * [lidongyang](http://my.oschina.net/lidongyang) * [seveniu](https://github.com/seveniu) * [sebastian1118](https://github.com/sebastian1118) +* [codev777](https://github.com/codev777) ### Thanks: diff --git a/pom.xml b/pom.xml index 9bfc505..4ab38b1 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.5.0 + 0.5.2-SNAPSHOT 4.0.0 pom @@ -54,7 +54,7 @@ webmagic-selenium webmagic-saxon webmagic-samples - webmagic-avalon + diff --git a/webmagic-avalon/forger/pom.xml b/webmagic-avalon/forger/pom.xml index 89796d1..44b42f9 100644 --- a/webmagic-avalon/forger/pom.xml +++ b/webmagic-avalon/forger/pom.xml @@ -7,7 +7,7 @@ us.codecraft forger - 0.1.0 + 0.1.1-SNAPSHOT 4.0.0 jar diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml index d1fadc4..0dbb369 100644 --- a/webmagic-avalon/pom.xml +++ b/webmagic-avalon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 @@ -39,12 +39,6 @@ 1.1.1 - - us.codecraft - forger - 0.1.1-SNAPSHOT - - org.freemarker freemarker diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml index 020ca8a..ed364c1 100644 --- a/webmagic-avalon/webmagic-admin/pom.xml +++ b/webmagic-avalon/webmagic-admin/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index 32eb8b4..9c7199a 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 @@ -26,7 +26,7 @@ us.codecraft forger - 0.1.0 + 0.1.1-SNAPSHOT @@ -150,18 +150,4 @@ - - - sonatype-nexus-snapshots - Sonatype Nexus Snapshots - https://oss.sonatype.org/content/repositories/snapshots - - false - - - true - - - - diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml index f085c82..ebc5174 100644 --- a/webmagic-avalon/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.0 + 0.5.1-SNAPSHOT 4.0.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 93ced05..6191b96 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.0 + 0.5.2-SNAPSHOT 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java similarity index 57% rename from webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java index 7b319b6..b14d8ee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicatedRemoveScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java @@ -4,6 +4,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; +import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; /** * Remove duplicate urls and only push urls which are not duplicate.

@@ -11,30 +13,30 @@ import us.codecraft.webmagic.Task; * @author code4crafer@gmail.com * @since 0.5.0 */ -public abstract class DuplicatedRemoveScheduler implements Scheduler { +public abstract class DuplicateRemovedScheduler implements Scheduler { protected Logger logger = LoggerFactory.getLogger(getClass()); + private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover(); + + public DuplicateRemover getDuplicateRemover() { + return duplicatedRemover; + } + + public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) { + this.duplicatedRemover = duplicatedRemover; + return this; + } + @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (isDuplicate(request, task) || shouldReserved(request)) { + if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } - /** - * Reset duplicate check. - */ - public abstract void resetDuplicateCheck(Task task); - - /** - * @param request - * @return - */ - protected abstract boolean isDuplicate(Request request, Task task); - protected boolean shouldReserved(Request request) { return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 38c9b6c..8fa1b9e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -17,7 +17,7 @@ import java.util.concurrent.PriorityBlockingQueue; * @since 0.2.1 */ @ThreadSafe -public class PriorityScheduler extends LocalDuplicatedRemoveScheduler { +public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { public static final int INITIAL_CAPACITY = 5; @@ -65,4 +65,9 @@ public class PriorityScheduler extends LocalDuplicatedRemoveScheduler { public int getLeftRequestsCount(Task task) { return noPriorityQueue.size(); } + + @Override + public int getTotalRequestsCount(Task task) { + return getDuplicateRemover().getTotalRequestsCount(task); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 511d8a0..c38311f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -16,7 +16,7 @@ import java.util.concurrent.LinkedBlockingQueue; * @since 0.1.0 */ @ThreadSafe -public class QueueScheduler extends LocalDuplicatedRemoveScheduler { +public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { private BlockingQueue queue = new LinkedBlockingQueue(); @@ -34,4 +34,9 @@ public class QueueScheduler extends LocalDuplicatedRemoveScheduler { public int getLeftRequestsCount(Task task) { return queue.size(); } + + @Override + public int getTotalRequestsCount(Task task) { + return getDuplicateRemover().getTotalRequestsCount(task); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java new file mode 100644 index 0000000..6d5e597 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java @@ -0,0 +1,70 @@ +package us.codecraft.webmagic.scheduler.component; + +import com.google.common.hash.BloomFilter; +import com.google.common.hash.Funnels; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; + +import java.nio.charset.Charset; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * BloomFilterDuplicateRemover for huge number of urls. + * + * @author code4crafer@gmail.com + * @since 0.5.1 + */ +public class BloomFilterDuplicateRemover implements DuplicateRemover { + + private int expectedInsertions; + + private double fpp; + + private AtomicInteger counter; + + public BloomFilterDuplicateRemover(int expectedInsertions) { + this(expectedInsertions, 0.01); + } + + /** + * + * @param expectedInsertions the number of expected insertions to the constructed + * @param fpp the desired false positive probability (must be positive and less than 1.0) + */ + public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) { + this.expectedInsertions = expectedInsertions; + this.fpp = fpp; + this.bloomFilter = rebuildBloomFilter(); + } + + protected BloomFilter rebuildBloomFilter() { + counter = new AtomicInteger(0); + return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp); + } + + private final BloomFilter bloomFilter; + + @Override + public boolean isDuplicate(Request request, Task task) { + boolean isDuplicate = bloomFilter.mightContain(getUrl(request)); + if (!isDuplicate) { + bloomFilter.put(getUrl(request)); + counter.incrementAndGet(); + } + return isDuplicate; + } + + protected String getUrl(Request request) { + return request.getUrl(); + } + + @Override + public void resetDuplicateCheck(Task task) { + rebuildBloomFilter(); + } + + @Override + public int getTotalRequestsCount(Task task) { + return counter.get(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java new file mode 100644 index 0000000..fa88976 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java @@ -0,0 +1,35 @@ +package us.codecraft.webmagic.scheduler.component; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; + +/** + * Remove duplicate requests. + * @author code4crafer@gmail.com + * @since 0.5.1 + */ +public interface DuplicateRemover { + /** + * + * Check whether the request is duplicate. + * + * @param request + * @param task + * @return + */ + public boolean isDuplicate(Request request, Task task); + + /** + * Reset duplicate check. + * @param task + */ + public void resetDuplicateCheck(Task task); + + /** + * Get TotalRequestsCount for monitor. + * @param task + * @return + */ + public int getTotalRequestsCount(Task task); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java similarity index 56% rename from webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java index c127c98..1190762 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/LocalDuplicatedRemoveScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.scheduler; +package us.codecraft.webmagic.scheduler.component; import com.google.common.collect.Sets; import us.codecraft.webmagic.Request; @@ -8,25 +8,26 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; /** - * Base Scheduler with duplicated urls removed by hash set.

- * - * @author code4crafter@gmail.com - * @since 0.5.0 + * @author code4crafer@gmail.com */ -public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler { +public class HashSetDuplicateRemover implements DuplicateRemover { private Set urls = Sets.newSetFromMap(new ConcurrentHashMap()); + @Override + public boolean isDuplicate(Request request, Task task) { + return !urls.add(getUrl(request)); + } + + protected String getUrl(Request request) { + return request.getUrl(); + } + @Override public void resetDuplicateCheck(Task task) { urls.clear(); } - @Override - protected boolean isDuplicate(Request request, Task task) { - return urls.add(request.getUrl()); - } - @Override public int getTotalRequestsCount(Task task) { return urls.size(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html new file mode 100644 index 0000000..213707c --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html @@ -0,0 +1,5 @@ + + +Component of scheduler. + + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java index e65e2f9..79b9efe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java @@ -22,10 +22,10 @@ public class FilePersistentBase { } public void setPath(String path) { - this.path = path; if (!path.endsWith(PATH_SEPERATOR)) { path += PATH_SEPERATOR; } + this.path = path; } public File getFile(String fullName) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java new file mode 100644 index 0000000..f8e0b9a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java @@ -0,0 +1,80 @@ +package us.codecraft.webmagic.scheduler; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; +import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafer@gmail.com + */ +public class BloomFilterDuplicateRemoverTest { + + @Test + public void testRemove() throws Exception { + BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10); + boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null); + assertThat(isDuplicate).isFalse(); + isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null); + assertThat(isDuplicate).isTrue(); + isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null); + assertThat(isDuplicate).isFalse(); + isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null); + assertThat(isDuplicate).isTrue(); + + } + + @Ignore("long time") + @Test + public void testMemory() throws Exception { + int times = 5000000; + DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005); + long freeMemory = Runtime.getRuntime().freeMemory(); + long time = System.currentTimeMillis(); + for (int i = 0; i < times; i++) { + duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + } + System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - time)); + System.out.println("Memory used by bloomfilter:" + (freeMemory - Runtime.getRuntime().freeMemory())); + + duplicateRemover = new HashSetDuplicateRemover(); + System.gc(); + freeMemory = Runtime.getRuntime().freeMemory(); + time = System.currentTimeMillis(); + for (int i = 0; i < times; i++) { + duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + } + System.out.println("Time used by hashset:" + (System.currentTimeMillis() - time)); + System.out.println("Memory used by hashset:" + (freeMemory - Runtime.getRuntime().freeMemory())); + } + + @Ignore("long time") + @Test + public void testMissHit() throws Exception { + int times = 5000000; + DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01); + int right = 0; + int wrong = 0; + int missCheck = 0; + for (int i = 0; i < times; i++) { + boolean duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + if (duplicate) { + wrong++; + } else { + right++; + } + duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null); + if (!duplicate) { + missCheck++; + } + } + + System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck); + } + + +} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 5d93cdc..45a71cd 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.0 + 0.5.2-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 4215ab8..211b698 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -2,8 +2,6 @@ package us.codecraft.webmagic.scheduler; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.math.NumberUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -23,9 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler { - - private Logger logger = LoggerFactory.getLogger(getClass()); +public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { private String filePath = System.getProperty("java.io.tmpdir"); @@ -166,4 +162,9 @@ public class FileCacheQueueScheduler extends LocalDuplicatedRemoveScheduler { public int getLeftRequestsCount(Task task) { return queue.size(); } + + @Override + public int getTotalRequestsCount(Task task) { + return getDuplicateRemover().getTotalRequestsCount(task); + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index dc2ee2e..bbb945c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -7,6 +7,7 @@ import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; /** * Use Redis as url scheduler for distributed crawlers.
@@ -14,7 +15,7 @@ import us.codecraft.webmagic.Task; * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class RedisScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler { +public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover { private JedisPool pool; @@ -25,11 +26,12 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor private static final String ITEM_PREFIX = "item_"; public RedisScheduler(String host) { - pool = new JedisPool(new JedisPoolConfig(), host); + this(new JedisPool(new JedisPoolConfig(), host)); } public RedisScheduler(JedisPool pool) { this.pool = pool; + setDuplicateRemover(this); } @Override @@ -43,10 +45,10 @@ public class RedisScheduler extends DuplicatedRemoveScheduler implements Monitor } @Override - protected boolean isDuplicate(Request request, Task task) { + public boolean isDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { - boolean isDuplicate = !jedis.sismember(getSetKey(task), request.getUrl()); + boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl()); if (!isDuplicate) { jedis.sadd(getSetKey(task), request.getUrl()); } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 4769c21..df0ebfc 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.2-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java new file mode 100644 index 0000000..a980851 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java @@ -0,0 +1,53 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Html; + +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public class AmanzonPageProcessor implements PageProcessor{ + public void process(Page page) { + + Html html = page.getHtml(); + List questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); + + if(questionList != null && questionList.size() > 1) + { + //i=0是列名称,所以i从1开始 + for( int i = 1 ; i < questionList.size(); i++) + { + System.out.println(questionList.get(i)); + Html tempHtml = Html.create(""+questionList.get(i)+"
"); + String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); + System.out.println(comment); + String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); + System.out.println(answerNum); + String createTime = tempHtml.xpath("//td[3]/text()").toString(); + System.out.println(createTime); + + /* Document doc = Jsoup.parse(questionList.get(i)); + Html hmt = Html.create(questionList.get(i)) ; + String str = hmt.links().toString(); + String content = doc.getElementsByTag("a").text(); + String ss = doc.text();*/ + + } + } + + } + + @Override + public Site getSite() { + return Site.me(); + } + + public static void main(String[] args) { + Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java new file mode 100644 index 0000000..0aecb7b --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.samples; + +/** + * @author code4crafer@gmail.com + */ +public class GithubRepo { + + private String name; + + private String author; + + private String readme; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getAuthor() { + return author; + } + + public void setAuthor(String author) { + this.author = author; + } + + public String getReadme() { + return readme; + } + + public void setReadme(String readme) { + this.readme = readme; + } +} \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java new file mode 100644 index 0000000..0de61fb --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ * @since 0.5.1 + */ +public class GithubRepoPageProcessor implements PageProcessor { + + private Site site = Site.me().setRetryTimes(3).setSleepTime(0); + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); + GithubRepo githubRepo = new GithubRepo(); + githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); + if (githubRepo.getName() == null) { + //skip this page + page.setSkip(true); + } else { + page.putField("repo", githubRepo); + } + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index ded1a5f..2e3996c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -3,8 +3,12 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.monitor.SpiderMonitor; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.QueueScheduler; +import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover; +import javax.management.JMException; import java.util.List; /** @@ -29,7 +33,9 @@ public class OschinaBlogPageProcesser implements PageProcessor { } - public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).run(); + public static void main(String[] args) throws JMException { + Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000))); + SpiderMonitor.instance().register(spider); + spider.run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java new file mode 100644 index 0000000..2458c8a --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java @@ -0,0 +1,7 @@ +package us.codecraft.webmagic.samples.pipeline; + +/** + * @author code4crafer@gmail.com + */ +public class ReplacePipeline { +} diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index a444d39..a45bcdc 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.2-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 2b846d8..fcf8d1a 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.2-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 038b371..a8c6707 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.0 + 0.5.2-SNAPSHOT 4.0.0 diff --git a/zh_docs/README.md b/zh_docs/README.md index cee747c..b23bf83 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.5.0 + 0.5.1 us.codecraft webmagic-extension - 0.5.0 + 0.5.1 ``` @@ -192,6 +192,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) * [lidongyang](http://my.oschina.net/lidongyang) * [seveniu](https://github.com/seveniu) * [sebastian1118](https://github.com/sebastian1118) +* [codev777](https://github.com/codev777) ### 邮件组: