Update FileCache to be more useful
parent
7829c8fe02
commit
1148450ff9
|
@ -16,13 +16,11 @@ public class SimplePageProcessor implements PageProcessor {
|
||||||
|
|
||||||
private String urlPattern;
|
private String urlPattern;
|
||||||
|
|
||||||
private static final String UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";
|
|
||||||
|
|
||||||
private Site site;
|
private Site site;
|
||||||
|
|
||||||
public SimplePageProcessor(String startUrl, String urlPattern) {
|
public SimplePageProcessor(String startUrl, String urlPattern) {
|
||||||
this.site = Site.me().addStartUrl(startUrl).
|
this.site = Site.me().addStartUrl(startUrl).
|
||||||
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
|
setDomain(UrlUtils.getDomain(startUrl));
|
||||||
//compile "*" expression to regex
|
//compile "*" expression to regex
|
||||||
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
|
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import us.codecraft.webmagic.*;
|
||||||
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||||
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||||
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Download file and saved to file for cache.<br>
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* @since 0.2.1
|
||||||
|
*/
|
||||||
|
public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
|
||||||
|
|
||||||
|
private Downloader downloaderWhenFileMiss;
|
||||||
|
|
||||||
|
private final PageProcessor pageProcessor;
|
||||||
|
|
||||||
|
private Logger logger = Logger.getLogger(getClass());
|
||||||
|
|
||||||
|
public FileCache(String startUrl, String urlPattern) {
|
||||||
|
this(startUrl, urlPattern, "/data/webmagic/temp/");
|
||||||
|
}
|
||||||
|
|
||||||
|
public FileCache(String startUrl, String urlPattern, String path) {
|
||||||
|
this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
|
||||||
|
setPath(path);
|
||||||
|
downloaderWhenFileMiss = new HttpClientDownloader();
|
||||||
|
}
|
||||||
|
|
||||||
|
public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
|
||||||
|
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Page download(Request request, Task task) {
|
||||||
|
String path = this.path + "/" + task.getUUID() + "/";
|
||||||
|
Page page = null;
|
||||||
|
try {
|
||||||
|
final File file = getFile(path + DigestUtils.md5Hex(request.getUrl()));
|
||||||
|
BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
|
||||||
|
String line = bufferedReader.readLine();
|
||||||
|
if (line.equals("url:\t" + request.getUrl())) {
|
||||||
|
final String html = getHtml(bufferedReader);
|
||||||
|
page = new Page();
|
||||||
|
page.setRequest(request);
|
||||||
|
page.setUrl(PlainText.create(request.getUrl()));
|
||||||
|
page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
if (e instanceof FileNotFoundException) {
|
||||||
|
logger.info("File not exist for url " + request.getUrl());
|
||||||
|
} else {
|
||||||
|
logger.warn("File read error for url " + request.getUrl(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (page == null) {
|
||||||
|
page = downloadWhenMiss(request, task);
|
||||||
|
}
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setThread(int thread) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
||||||
|
String line;
|
||||||
|
StringBuilder htmlBuilder = new StringBuilder();
|
||||||
|
line = bufferedReader.readLine();
|
||||||
|
line = StringUtils.removeStart(line, "html:\t");
|
||||||
|
htmlBuilder.append(line);
|
||||||
|
while ((line = bufferedReader.readLine()) != null) {
|
||||||
|
htmlBuilder.append(line);
|
||||||
|
}
|
||||||
|
return htmlBuilder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Page downloadWhenMiss(Request request, Task task) {
|
||||||
|
Page page = null;
|
||||||
|
if (downloaderWhenFileMiss != null) {
|
||||||
|
page = downloaderWhenFileMiss.download(request, task);
|
||||||
|
}
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(ResultItems resultItems, Task task) {
|
||||||
|
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
|
||||||
|
try {
|
||||||
|
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
|
||||||
|
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||||
|
printWriter.println("html:\t" + resultItems.get("html"));
|
||||||
|
printWriter.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.warn("write file error", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
pageProcessor.process(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return pageProcessor.getSite();
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,97 +0,0 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.Request;
|
|
||||||
import us.codecraft.webmagic.Task;
|
|
||||||
import us.codecraft.webmagic.selector.Html;
|
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。<br>
|
|
||||||
* @author code4crafer@gmail.com
|
|
||||||
* Date: 13-6-24
|
|
||||||
* Time: 上午7:24
|
|
||||||
*/
|
|
||||||
public class FileDownloader implements Downloader {
|
|
||||||
|
|
||||||
private String path = "/data/temp/webmagic/";
|
|
||||||
|
|
||||||
private Downloader downloaderWhenFileMiss;
|
|
||||||
|
|
||||||
private Logger logger = Logger.getLogger(getClass());
|
|
||||||
|
|
||||||
public FileDownloader() {
|
|
||||||
this("/data/temp/webmagic/", null);
|
|
||||||
}
|
|
||||||
|
|
||||||
public FileDownloader(String path) {
|
|
||||||
this(path, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
|
|
||||||
if (!path.endsWith("/")&&!path.endsWith("\\")){
|
|
||||||
path+="/";
|
|
||||||
}
|
|
||||||
this.path = path;
|
|
||||||
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Page download(Request request, Task task) {
|
|
||||||
String path = this.path + "/" + task.getUUID() + "/";
|
|
||||||
Page page = null;
|
|
||||||
try {
|
|
||||||
final File file = new File(path + DigestUtils.md5Hex(request.getUrl()));
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
|
|
||||||
String line = null;
|
|
||||||
line = bufferedReader.readLine();
|
|
||||||
if (line.equals("url:\t" + request.getUrl())) {
|
|
||||||
final String html = getHtml(bufferedReader);
|
|
||||||
page = new Page();
|
|
||||||
page.setRequest(request);
|
|
||||||
page.setUrl(PlainText.create(request.getUrl()));
|
|
||||||
page.setHtml(Html.create(html));
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
if (e instanceof FileNotFoundException) {
|
|
||||||
logger.info("File not exist for url " + request.getUrl());
|
|
||||||
} else {
|
|
||||||
logger.warn("File read error for url " + request.getUrl(), e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (page == null) {
|
|
||||||
page = downloadWhenMiss(request, task);
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setThread(int thread) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
|
||||||
String line;
|
|
||||||
StringBuilder htmlBuilder= new StringBuilder();
|
|
||||||
line = bufferedReader.readLine();
|
|
||||||
line = StringUtils.removeStart(line, "html:\t");
|
|
||||||
htmlBuilder.append(line);
|
|
||||||
while ((line=bufferedReader.readLine())!=null){
|
|
||||||
htmlBuilder.append(line);
|
|
||||||
}
|
|
||||||
return htmlBuilder.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Page downloadWhenMiss(Request request, Task task) {
|
|
||||||
Page page = null;
|
|
||||||
if (downloaderWhenFileMiss != null) {
|
|
||||||
page = downloaderWhenFileMiss.download(request, task);
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
*/
|
||||||
|
public class FileCacheTest {
|
||||||
|
|
||||||
|
// @Ignore("takes long")
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*");
|
||||||
|
Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -4,7 +4,7 @@
|
||||||
<date-generated>Sat Aug 17 14:14:45 CST 2013</date-generated>
|
<date-generated>Sat Aug 17 14:14:45 CST 2013</date-generated>
|
||||||
</meta>
|
</meta>
|
||||||
<comment>
|
<comment>
|
||||||
<key><![CDATA[us.codecraft.webmagic.downloader.FileDownloader]]></key>
|
<key><![CDATA[us.codecraft.webmagic.downloader.FileCache]]></key>
|
||||||
<data><![CDATA[ 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。<br>
|
<data><![CDATA[ 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。<br>
|
||||||
@author code4crafer@gmail.com
|
@author code4crafer@gmail.com
|
||||||
Date: 13-6-24
|
Date: 13-6-24
|
||||||
|
|
Loading…
Reference in New Issue