Serialize request URL only in FileCacheQueueScheduler.

master
Sutra Zhou 2020-06-14 00:20:39 +08:00
parent 7945c0612d
commit 5d14efc50f
1 changed file with 17 additions and 20 deletions

View File

@@ -1,14 +1,13 @@
package us.codecraft.webmagic.scheduler; package us.codecraft.webmagic.scheduler;
import org.apache.commons.codec.binary.Base64; import java.io.BufferedReader;
import org.apache.commons.io.IOUtils; import java.io.Closeable;
import org.apache.commons.lang3.SerializationUtils; import java.io.File;
import org.apache.commons.lang3.math.NumberUtils; import java.io.FileNotFoundException;
import us.codecraft.webmagic.Request; import java.io.FileReader;
import us.codecraft.webmagic.Task; import java.io.FileWriter;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import java.io.IOException;
import java.io.PrintWriter;
import java.io.*;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.Set; import java.util.Set;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
@@ -19,6 +18,13 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/** /**
* Store urls and cursor in files so that a Spider can resume the status when shutdown.<br> * Store urls and cursor in files so that a Spider can resume the status when shutdown.<br>
@@ -208,20 +214,11 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
} }
protected String serializeRequest(Request request) { protected String serializeRequest(Request request) {
String line = String.format("%1$s\t%2$s", request.getUrl(), return request.getUrl();
Base64.encodeBase64String(SerializationUtils.serialize(request)));
return line;
} }
protected Request deserializeRequest(String line) { protected Request deserializeRequest(String line) {
Request request; return new Request(line);
String[] sections = line.split("\t");
if (sections.length >= 2) {
request = (Request) SerializationUtils.deserialize(Base64.decodeBase64(sections[1]));
} else {
request = new Request(sections[0]);
}
return request;
} }
} }