add id
parent
632ca0ef83
commit
6428e20543
|
@ -12,13 +12,18 @@ public class Site {
|
|||
|
||||
private String domain;
|
||||
|
||||
/**
|
||||
* used to identify a task
|
||||
*/
|
||||
private String identifier;
|
||||
|
||||
private String userAgent;
|
||||
|
||||
private Map<String,String> cookies = new LinkedHashMap<String, String>();
|
||||
private Map<String, String> cookies = new LinkedHashMap<String, String>();
|
||||
|
||||
private String encoding;
|
||||
|
||||
private List<String> startUrls;
|
||||
private List<String> startUrls = new ArrayList<String>();
|
||||
|
||||
private int sleepTime = 3000;
|
||||
|
||||
|
@ -34,8 +39,8 @@ public class Site {
|
|||
return new Site();
|
||||
}
|
||||
|
||||
public Site setCookie(String name,String value) {
|
||||
cookies.put(name,value);
|
||||
public Site setCookie(String name, String value) {
|
||||
cookies.put(name, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -44,7 +49,7 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Map<String,String> getCookies() {
|
||||
public Map<String, String> getCookies() {
|
||||
return cookies;
|
||||
}
|
||||
|
||||
|
@ -61,6 +66,15 @@ public class Site {
|
|||
return this;
|
||||
}
|
||||
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public Site setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getEncoding() {
|
||||
return encoding;
|
||||
}
|
||||
|
|
|
@ -85,7 +85,6 @@ public class Spider implements Runnable {
|
|||
Thread.sleep(time);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ public class FilePipeline implements Pipeline {
|
|||
public void process(Page page, Site site) {
|
||||
String domain = site.getDomain();
|
||||
domain = UrlUtils.getDomain(domain);
|
||||
String path = this.path + "" + domain + "/";
|
||||
String path = this.path + "" + domain + "#" + site.getIdentifier() + "/";
|
||||
File file = new File(path);
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
|
@ -46,7 +46,7 @@ public class FilePipeline implements Pipeline {
|
|||
}
|
||||
printWriter.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
|
||||
private void init() {
|
||||
File file = new File(filePath);
|
||||
if (!file.exists()){
|
||||
if (!file.exists()) {
|
||||
file.mkdirs();
|
||||
}
|
||||
readFile();
|
||||
|
@ -81,8 +81,8 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
|
||||
private void initWriter() {
|
||||
try {
|
||||
fileUrlWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileUrlAllName, true));
|
||||
fileCursorWriter = new PrintWriter(new FileWriter(filePath + site.getDomain() + fileCursor, false));
|
||||
fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true));
|
||||
fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("init cache schedular error", e);
|
||||
}
|
||||
|
@ -100,7 +100,7 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
|
||||
private void readUrlFile() throws IOException {
|
||||
String line;
|
||||
BufferedReader fileUrlReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileUrlAllName));
|
||||
BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
|
||||
int lineReaded = 0;
|
||||
while ((line = fileUrlReader.readLine()) != null) {
|
||||
urls.add(line.trim());
|
||||
|
@ -112,7 +112,7 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
}
|
||||
|
||||
private void readCursorFile() throws IOException {
|
||||
BufferedReader fileCursorReader = new BufferedReader(new FileReader(filePath + site.getDomain() + fileCursor));
|
||||
BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
|
||||
String line = null;
|
||||
//read the last number
|
||||
while ((line = fileCursorReader.readLine()) != null) {
|
||||
|
@ -120,8 +120,12 @@ public class FileCacheQueueSchedular implements Schedular {
|
|||
}
|
||||
}
|
||||
|
||||
private String getFileName(String filename) {
|
||||
return filePath + site.getDomain() + "#" + site.getIdentifier() + filename;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void push(Request request,Site site) {
|
||||
public synchronized void push(Request request, Site site) {
|
||||
if (!inited.get()) {
|
||||
init();
|
||||
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
|
@ -17,7 +16,6 @@ import java.io.IOException;
|
|||
*/
|
||||
public class DiaoyuwengProcessorTest {
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
||||
|
|
Loading…
Reference in New Issue