diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index 5b1ceaf..b2dd3db 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -101,7 +101,7 @@ public class Page {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
break;
}
- s = UrlUtils.fixRelativeUrl(s, url.toString());
+ s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s));
}
}
@@ -116,7 +116,7 @@ public class Page {
return;
}
synchronized (targetRequests) {
- requestString = UrlUtils.fixRelativeUrl(requestString, url.toString());
+ requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
targetRequests.add(new Request(requestString));
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index e4ae0ff..ac3ea0f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -58,7 +58,7 @@ public class HttpClientDownloader implements Downloader {
//charset
if (charset == null) {
String value = httpResponse.getEntity().getContentType().getValue();
- charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
+ charset = UrlUtils.getCharset(value);
}
//
handleGzip(httpResponse);
@@ -82,8 +82,8 @@ public class HttpClientDownloader implements Downloader {
Header ceheader = httpResponse.getEntity().getContentEncoding();
if (ceheader != null) {
HeaderElement[] codecs = ceheader.getElements();
- for (int i = 0; i < codecs.length; i++) {
- if (codecs[i].getName().equalsIgnoreCase("gzip")) {
+ for (HeaderElement codec : codecs) {
+ if (codec.getName().equalsIgnoreCase("gzip")) {
httpResponse.setEntity(
new GzipDecompressingEntity(httpResponse.getEntity()));
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
index 6c2abba..dff2ded 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
@@ -7,6 +7,7 @@ import us.codecraft.webmagic.selector.Selectable;
import java.util.Map;
/**
+ * 命令行输出抽取结果。可用于测试。
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午1:45
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
index b079dcc..e48e2bb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
@@ -11,6 +11,7 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
+ * Pipeline implementation that persists results to files.
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午6:28
@@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline {
private Logger logger = Logger.getLogger(getClass());
+ /**
+ * 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/"
+ */
public FilePipeline() {
}
+ /**
+ * 新建一个FilePipeline
+ * @param path 文件保存路径
+ */
public FilePipeline(String path) {
this.path = path;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
index 1be447c..408392d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
@@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;
/**
+ * Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午1:39
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
index c36ae98..3963d08 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
@@ -4,6 +4,8 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
/**
+ * 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。
+ * Implement this interface to build various spiders.
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 上午11:42
@@ -11,13 +13,13 @@ import us.codecraft.webmagic.Site;
public interface PageProcessor {
/**
- * extends the class to implements variaty spiders
+ * 定义如何处理页面,包括链接提取、内容抽取等。
* @param page
*/
public void process(Page page);
/**
- * the site the processor for
+ * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
* @return site
*/
public Site getSite();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
index 0d52446..47d3748 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
@@ -7,6 +7,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List;
/**
+ * 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。
* @author code4crafter@gmail.com
* Date: 13-4-22
* Time: 下午9:15
@@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor {
public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
+ //compile "*" expression to regex
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java
index 1f5298a..77a6c0b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java
@@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
/**
+ * 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午1:13
@@ -91,6 +92,7 @@ public class FileCacheQueueScheduler implements Scheduler {
readCursorFile();
readUrlFile();
} catch (IOException e) {
+ logger.error("init file error",e);
}
}
@@ -109,7 +111,7 @@ public class FileCacheQueueScheduler implements Scheduler {
private void readCursorFile() throws IOException {
BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
- String line = null;
+ String line;
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
cursor = new AtomicInteger(NumberUtils.toInt(line));
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java
index 6976885..613e406 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java
@@ -10,6 +10,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
/**
+ * 内存队列实现的线程安全Scheduler。
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午1:13
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java
index 7e02132..bf440ba 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java
@@ -4,14 +4,26 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
+ * 包含url管理和调度的接口。包括url抓取队列,url去重等功能。
+ * Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。
* @author code4crafter@gmail.com
* Date: 13-4-21
* Time: 下午1:12
*/
public interface Scheduler {
+ /**
+ * 加入一个待抓取的链接
+ * @param request 待抓取的链接
+ * @param task 定义的任务,以满足单Scheduler多Task的情况
+ */
public void push(Request request,Task task);
+ /**
+ * 返回下一个要抓取的链接
+ * @param task 定义的任务,以满足单Scheduler多Task的情况
+ * @return the next request to crawl
+ */
public Request poll(Task task);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html
index 0e35610..7887dd5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html
@@ -1,5 +1,5 @@