diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 413d8d8..674ac5b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -12,13 +12,18 @@ public class Site { private String domain; + /** + * used to identify a task + */ + private String identifier; + private String userAgent; - private Map cookies = new LinkedHashMap(); + private Map cookies = new LinkedHashMap(); private String encoding; - private List startUrls; + private List startUrls = new ArrayList(); private int sleepTime = 3000; @@ -34,8 +39,8 @@ public class Site { return new Site(); } - public Site setCookie(String name,String value) { - cookies.put(name,value); + public Site setCookie(String name, String value) { + cookies.put(name, value); return this; } @@ -44,7 +49,7 @@ public class Site { return this; } - public Map getCookies() { + public Map getCookies() { return cookies; } @@ -61,6 +66,15 @@ public class Site { return this; } + public String getIdentifier() { + return identifier; + } + + public Site setIdentifier(String identifier) { + this.identifier = identifier; + return this; + } + public String getEncoding() { return encoding; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 180d752..7f34850 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -85,7 +85,6 @@ public class Spider implements Runnable { Thread.sleep(time); } catch (InterruptedException e) { e.printStackTrace(); - ; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index ca3144e..84a94ce 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -33,7 +33,7 @@ public class FilePipeline implements Pipeline { public void process(Page page, Site site) { String domain = site.getDomain(); domain = UrlUtils.getDomain(domain); - String path = this.path + "" + domain + "/"; + String path = this.path + "" + domain + "#" + site.getIdentifier() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); @@ -46,7 +46,7 @@ public class FilePipeline implements Pipeline { } printWriter.close(); } catch (IOException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + e.printStackTrace(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index 56c5f33..b3086a2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -60,7 +60,7 @@ public class FileCacheQueueSchedular implements Schedular { private void init() { File file = new File(filePath); - if (!file.exists()){ + if (!file.exists()) { file.mkdirs(); } readFile(); @@ -81,8 +81,8 @@ public class FileCacheQueueSchedular implements Schedular { private void initWriter() { try { - fileUrlWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileUrlAllName, true)); - fileCursorWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileCursor, false)); + fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true)); + fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false)); } catch (IOException e) { throw new RuntimeException("init cache schedular error", e); } @@ -100,7 +100,7 @@ public 
class FileCacheQueueSchedular implements Schedular { private void readUrlFile() throws IOException { String line; - BufferedReader fileUrlReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileUrlAllName)); + BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName))); int lineReaded = 0; while ((line = fileUrlReader.readLine()) != null) { urls.add(line.trim()); @@ -112,7 +112,7 @@ public class FileCacheQueueSchedular implements Schedular { } private void readCursorFile() throws IOException { - BufferedReader fileCursorReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileCursor)); + BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); String line = null; //read the last number while ((line = fileCursorReader.readLine()) != null) { @@ -120,8 +120,12 @@ public class FileCacheQueueSchedular implements Schedular { } } + private String getFileName(String filename) { + return filePath + site.getDomain() + "#" + site.getIdentifier() + filename; + } + @Override - public synchronized void push(Request request,Site site) { + public synchronized void push(Request request, Site site) { if (!inited.get()) { init(); } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 012b5e6..0c24b57 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.processor; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -17,7 +16,6 @@ import java.io.IOException; */ public class DiaoyuwengProcessorTest { - @Ignore @Test public void test() throws 
IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();