update java doc
parent
81e7f7982e
commit
96454fd74c
|
@ -101,7 +101,7 @@ public class Page {
|
|||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||
break;
|
||||
}
|
||||
s = UrlUtils.fixRelativeUrl(s, url.toString());
|
||||
s = UrlUtils.canonicalizeUrl(s, url.toString());
|
||||
targetRequests.add(new Request(s));
|
||||
}
|
||||
}
|
||||
|
@ -116,7 +116,7 @@ public class Page {
|
|||
return;
|
||||
}
|
||||
synchronized (targetRequests) {
|
||||
requestString = UrlUtils.fixRelativeUrl(requestString, url.toString());
|
||||
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
|
||||
targetRequests.add(new Request(requestString));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ public class HttpClientDownloader implements Downloader {
|
|||
//charset
|
||||
if (charset == null) {
|
||||
String value = httpResponse.getEntity().getContentType().getValue();
|
||||
charset = new PlainText(value).regex("charset=([^\\s]+)").toString();
|
||||
charset = UrlUtils.getCharset(value);
|
||||
}
|
||||
//
|
||||
handleGzip(httpResponse);
|
||||
|
@ -82,8 +82,8 @@ public class HttpClientDownloader implements Downloader {
|
|||
Header ceheader = httpResponse.getEntity().getContentEncoding();
|
||||
if (ceheader != null) {
|
||||
HeaderElement[] codecs = ceheader.getElements();
|
||||
for (int i = 0; i < codecs.length; i++) {
|
||||
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
|
||||
for (HeaderElement codec : codecs) {
|
||||
if (codec.getName().equalsIgnoreCase("gzip")) {
|
||||
httpResponse.setEntity(
|
||||
new GzipDecompressingEntity(httpResponse.getEntity()));
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ import us.codecraft.webmagic.selector.Selectable;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 命令行输出抽取结果。可用于测试。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:45
|
||||
|
|
|
@ -11,6 +11,7 @@ import java.io.IOException;
|
|||
import java.io.PrintWriter;
|
||||
|
||||
/**
|
||||
* 持久化到文件的接口。
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午6:28
|
||||
|
@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline {
|
|||
|
||||
private Logger logger = Logger.getLogger(getClass());
|
||||
|
||||
/**
|
||||
* 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/"
|
||||
*/
|
||||
public FilePipeline() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建一个FilePipeline
|
||||
* @param path 文件保存路径
|
||||
*/
|
||||
public FilePipeline(String path) {
|
||||
this.path = path;
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:39
|
||||
|
|
|
@ -4,6 +4,8 @@ import us.codecraft.webmagic.Page;
|
|||
import us.codecraft.webmagic.Site;
|
||||
|
||||
/**
|
||||
* 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。<br>
|
||||
* Implement this interface to build various spiders.<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午11:42
|
||||
|
@ -11,13 +13,13 @@ import us.codecraft.webmagic.Site;
|
|||
public interface PageProcessor {
|
||||
|
||||
/**
|
||||
* extends the class to implements variaty spiders
|
||||
* 定义如何处理页面,包括链接提取、内容抽取等。
|
||||
* @param page
|
||||
*/
|
||||
public void process(Page page);
|
||||
|
||||
/**
|
||||
* the site the processor for
|
||||
* 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。
|
||||
* @return site
|
||||
*/
|
||||
public Site getSite();
|
||||
|
|
|
@ -7,6 +7,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-22
|
||||
* Time: 下午9:15
|
||||
|
@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor {
|
|||
public SimplePageProcessor(String startUrl, String urlPattern) {
|
||||
this.site = Site.me().addStartUrl(startUrl).
|
||||
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
|
||||
//compile "*" expression to regex
|
||||
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
|
||||
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
|||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:13
|
||||
|
@ -91,6 +92,7 @@ public class FileCacheQueueScheduler implements Scheduler {
|
|||
readCursorFile();
|
||||
readUrlFile();
|
||||
} catch (IOException e) {
|
||||
logger.error("init file error",e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -109,7 +111,7 @@ public class FileCacheQueueScheduler implements Scheduler {
|
|||
|
||||
private void readCursorFile() throws IOException {
|
||||
BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
|
||||
String line = null;
|
||||
String line;
|
||||
//read the last number
|
||||
while ((line = fileCursorReader.readLine()) != null) {
|
||||
cursor = new AtomicInteger(NumberUtils.toInt(line));
|
||||
|
|
|
@ -10,6 +10,7 @@ import java.util.concurrent.BlockingQueue;
|
|||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
/**
|
||||
* 内存队列实现的线程安全Scheduler。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:13
|
||||
|
|
|
@ -4,14 +4,26 @@ import us.codecraft.webmagic.Request;
|
|||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* 包含url管理和调度的接口。包括url抓取队列,url去重等功能。<br>
|
||||
* Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:12
|
||||
*/
|
||||
public interface Scheduler {
|
||||
|
||||
/**
|
||||
* 加入一个待抓取的链接
|
||||
* @param request 待抓取的链接
|
||||
* @param task 定义的任务,以满足单Scheduler多Task的情况
|
||||
*/
|
||||
public void push(Request request,Task task);
|
||||
|
||||
/**
|
||||
* 返回下一个要抓取的链接
|
||||
* @param task 定义的任务,以满足单Scheduler多Task的情况
|
||||
* @return 下一个要抓取的链接
|
||||
*/
|
||||
public Request poll(Task task);
|
||||
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
包含url管理和调度的接口Schedular及它的几个实现类。
|
||||
包含url管理和调度的接口Scheduler及它的几个实现类。
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -10,6 +10,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* css风格的选择器。包装了Jsoup。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午9:39
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
/**
|
||||
* 封装正则表达式抽取接口的类。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午7:39
|
||||
|
|
|
@ -18,7 +18,7 @@ public interface Selectable {
|
|||
public Selectable xpath(String xpath);
|
||||
|
||||
/**
|
||||
* select list with jquery selector
|
||||
* select list with css selector
|
||||
*
|
||||
* @param
|
||||
* @return
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* xpath的选择器。包装了HtmlCleaner。<br>
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 上午9:39
|
||||
|
@ -52,12 +53,12 @@ public class XpathSelector implements Selector {
|
|||
try {
|
||||
Object[] objects = tagNode.evaluateXPath(xpathStr);
|
||||
if (objects != null && objects.length >= 1) {
|
||||
for (int i = 0; i < objects.length; i++) {
|
||||
if (objects[i] instanceof TagNode) {
|
||||
TagNode tagNode1 = (TagNode) objects[i];
|
||||
for (Object object : objects) {
|
||||
if (object instanceof TagNode) {
|
||||
TagNode tagNode1 = (TagNode) object;
|
||||
results.add(htmlCleaner.getInnerHtml(tagNode1));
|
||||
} else {
|
||||
results.add(objects[i].toString());
|
||||
results.add(object.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,7 +14,13 @@ public class UrlUtils {
|
|||
|
||||
private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/");
|
||||
|
||||
public static String fixRelativeUrl(String url, String refer) {
|
||||
/**
|
||||
* 将url相对地址转化为绝对地址
|
||||
* @param url url地址
|
||||
* @param refer url地址来自哪个页面
|
||||
* @return 转化后的绝对地址
|
||||
*/
|
||||
public static String canonicalizeUrl(String url, String refer) {
|
||||
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
|
||||
return url;
|
||||
}
|
||||
|
@ -62,12 +68,12 @@ public class UrlUtils {
|
|||
|
||||
private static Pattern patternForProtocal = Pattern.compile("[\\w]+://");
|
||||
|
||||
public static String removeProtocal(String url) {
|
||||
public static String removeProtocol(String url) {
|
||||
return patternForProtocal.matcher(url).replaceAll("");
|
||||
}
|
||||
|
||||
public static String getDomain(String url) {
|
||||
String domain = removeProtocal(url);
|
||||
String domain = removeProtocol(url);
|
||||
int i = StringUtils.indexOf(domain, "/", 1);
|
||||
if (i > 0) {
|
||||
domain = StringUtils.substring(domain, 0, i);
|
||||
|
@ -84,7 +90,7 @@ public class UrlUtils {
|
|||
while (matcher.find()) {
|
||||
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
|
||||
stringBuilder.append(matcher.group(1));
|
||||
stringBuilder.append("\"" + fixRelativeUrl(matcher.group(2), url) + "\"");
|
||||
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
|
||||
lastEnd = matcher.end();
|
||||
}
|
||||
stringBuilder.append(StringUtils.substring(html, lastEnd));
|
||||
|
|
|
@ -12,18 +12,18 @@ public class UrlUtilsTest {
|
|||
|
||||
@Test
|
||||
public void testFixRelativeUrl() {
|
||||
String fixrelativeurl = UrlUtils.fixRelativeUrl("aa", "http://www.dianping.com/sh/ss/com");
|
||||
String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com");
|
||||
System.out.println("fix: " + fixrelativeurl);
|
||||
Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl);
|
||||
|
||||
fixrelativeurl = UrlUtils.fixRelativeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
||||
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
|
||||
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
|
||||
|
||||
fixrelativeurl = UrlUtils.fixRelativeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
|
||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
|
||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
||||
fixrelativeurl = UrlUtils.fixRelativeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
|
||||
fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
|
||||
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
|
||||
fixrelativeurl = UrlUtils.fixRelativeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
|
||||
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
|
||||
// fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com");
|
||||
// System.out.println("fix: " + fixrelativeurl);
|
||||
|
@ -628,7 +628,6 @@ public class UrlUtilsTest {
|
|||
"\t\t\t<script src=\"http://discuz.gtimg.cn/cloud/scripts/discuz_tips.js?v=1\" type=\"text/javascript\" charset=\"UTF-8\"></script></body>\n" +
|
||||
"</html>\n";
|
||||
String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/");
|
||||
String text = "<a class=\"xu_subscribe\" href=\"home.php?mod=spacecp&ac=profile&op=info\" >订阅<span >虎嗅</span></a>";
|
||||
Assert.assertTrue(html.contains("<a href=\"article"));
|
||||
Assert.assertFalse(newHtml.contains("<a href=\"article"));
|
||||
}
|
||||
|
|
|
@ -14,6 +14,6 @@ public class FreemarkerPipelineTest {
|
|||
|
||||
@Test
|
||||
public void testTemplateLoad() throws IOException {
|
||||
FreemarkerPipeline freemarkerPipeline = new FreemarkerPipeline("wordpress.ftl");
|
||||
new FreemarkerPipeline("wordpress.ftl");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue