Refactored Code to increase maintainability (#1152)
* Initial Commit * Assignment 1 Submission * Resolving Implementation Smells * Refactoring Code to increase maintainability
master
parent
28ac8bf9c4
commit
9b9f173c1c
|
@ -9,11 +9,8 @@ import java.util.Date;
|
|||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -76,7 +73,7 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
protected String uuid;
|
||||
|
||||
protected Scheduler scheduler = new QueueScheduler();
|
||||
protected SpiderScheduler scheduler;
|
||||
|
||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
|
@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {
|
|||
|
||||
protected boolean destroyWhenExit = true;
|
||||
|
||||
private ReentrantLock newUrlLock = new ReentrantLock();
|
||||
|
||||
private Condition newUrlCondition = newUrlLock.newCondition();
|
||||
|
||||
private List<SpiderListener> spiderListeners;
|
||||
|
||||
private final AtomicLong pageCount = new AtomicLong(0);
|
||||
|
@ -131,6 +124,7 @@ public class Spider implements Runnable, Task {
|
|||
public Spider(PageProcessor pageProcessor) {
|
||||
this.pageProcessor = pageProcessor;
|
||||
this.site = pageProcessor.getSite();
|
||||
this.scheduler = new SpiderScheduler(new QueueScheduler());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -186,15 +180,15 @@ public class Spider implements Runnable, Task {
|
|||
/**
|
||||
* set scheduler for Spider
|
||||
*
|
||||
* @param scheduler scheduler
|
||||
* @param updateScheduler scheduler
|
||||
* @return this
|
||||
* @see Scheduler
|
||||
* @since 0.2.1
|
||||
*/
|
||||
public Spider setScheduler(Scheduler scheduler) {
|
||||
public Spider setScheduler(Scheduler updateScheduler) {
|
||||
checkIfRunning();
|
||||
Scheduler oldScheduler = this.scheduler;
|
||||
this.scheduler = scheduler;
|
||||
SpiderScheduler oldScheduler = this.scheduler;
|
||||
scheduler.setScheduler(updateScheduler);
|
||||
if (oldScheduler != null) {
|
||||
Request request;
|
||||
while ((request = oldScheduler.poll(this)) != null) {
|
||||
|
@ -213,7 +207,7 @@ public class Spider implements Runnable, Task {
|
|||
* @deprecated
|
||||
*/
|
||||
@Deprecated
|
||||
public Spider pipeline(Pipeline pipeline) {
|
||||
public Spider pipeline(Pipeline pipeline) {
|
||||
return addPipeline(pipeline);
|
||||
}
|
||||
|
||||
|
@ -264,7 +258,7 @@ public class Spider implements Runnable, Task {
|
|||
* @deprecated
|
||||
*/
|
||||
@Deprecated
|
||||
public Spider downloader(Downloader downloader) {
|
||||
public Spider downloader(Downloader downloader) {
|
||||
return setDownloader(downloader);
|
||||
}
|
||||
|
||||
|
@ -333,10 +327,10 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
} else {
|
||||
// wait until new url added,
|
||||
if (waitNewUrl()) {
|
||||
//if interrupted
|
||||
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
|
||||
// if interrupted
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -353,7 +347,7 @@ public class Spider implements Runnable, Task {
|
|||
logger.error("process request " + request + " error", e);
|
||||
} finally {
|
||||
pageCount.incrementAndGet();
|
||||
signalNewUrl();
|
||||
scheduler.signalNewUrl();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -536,7 +530,7 @@ public class Spider implements Runnable, Task {
|
|||
for (String url : urls) {
|
||||
addRequest(new Request(url));
|
||||
}
|
||||
signalNewUrl();
|
||||
scheduler.signalNewUrl();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -588,42 +582,10 @@ public class Spider implements Runnable, Task {
|
|||
for (Request request : requests) {
|
||||
addRequest(request);
|
||||
}
|
||||
signalNewUrl();
|
||||
scheduler.signalNewUrl();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return isInterrupted
|
||||
*/
|
||||
private boolean waitNewUrl() {
|
||||
// now there may not be any thread live
|
||||
newUrlLock.lock();
|
||||
try {
|
||||
//double check,unnecessary, unless very fast concurrent
|
||||
if (threadPool.getThreadAlive() == 0) {
|
||||
return false;
|
||||
}
|
||||
//wait for amount of time
|
||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||
return false;
|
||||
} catch (InterruptedException e) {
|
||||
// logger.warn("waitNewUrl - interrupted, error {}", e);
|
||||
return true;
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
private void signalNewUrl() {
|
||||
try {
|
||||
newUrlLock.lock();
|
||||
newUrlCondition.signalAll();
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
public void start() {
|
||||
runAsync();
|
||||
}
|
||||
|
@ -799,7 +761,7 @@ public class Spider implements Runnable, Task {
|
|||
}
|
||||
|
||||
public Scheduler getScheduler() {
|
||||
return scheduler;
|
||||
return scheduler.getScheduler();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||
import us.codecraft.webmagic.thread.CountableThreadPool;
|
||||
|
||||
public class SpiderScheduler {
|
||||
private Scheduler scheduler;
|
||||
private final ReentrantLock newUrlLock = new ReentrantLock();
|
||||
private final Condition newUrlCondition = newUrlLock.newCondition();
|
||||
|
||||
public SpiderScheduler(Scheduler scheduler) {
|
||||
this.scheduler = scheduler;
|
||||
}
|
||||
|
||||
public Scheduler getScheduler() {
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
public void setScheduler(Scheduler scheduler) {
|
||||
this.scheduler = scheduler;
|
||||
}
|
||||
|
||||
public Request poll(Spider spider) {
|
||||
return scheduler.poll(spider);
|
||||
}
|
||||
|
||||
public void push(Request request, Spider spider) {
|
||||
scheduler.push(request, spider);
|
||||
}
|
||||
|
||||
public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
|
||||
newUrlLock.lock();
|
||||
try {
|
||||
if (threadPool.getThreadAlive() == 0) {
|
||||
return false;
|
||||
}
|
||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||
return false;
|
||||
} catch (InterruptedException e) {
|
||||
return true;
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
public void signalNewUrl() {
|
||||
try {
|
||||
newUrlLock.lock();
|
||||
newUrlCondition.signalAll();
|
||||
} finally {
|
||||
newUrlLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable {
|
|||
return elements;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||
return select(smartContentSelector, getSourceTexts());
|
||||
|
|
|
@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable {
|
|||
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable links() {
|
||||
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||
|
|
|
@ -51,14 +51,6 @@ public interface Selectable {
|
|||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable css(String selector, String attrName);
|
||||
|
||||
/**
|
||||
* select smart content with ReadAbility algorithm
|
||||
*
|
||||
* @return content
|
||||
*/
|
||||
public Selectable smartContent();
|
||||
|
||||
/**
|
||||
* select all links
|
||||
*
|
||||
|
|
|
@ -0,0 +1,85 @@
|
|||
package us.codecraft.webmagic.model.formatter;
|
||||
|
||||
/**
 * Strategy that decides whether a given class is one particular basic type
 * (primitive or its wrapper) and, if so, reports the wrapper class.
 */
public interface BasicClassDetector {

    /**
     * @param type the class to inspect
     * @return the wrapper class when {@code type} is recognized by this
     *         detector, otherwise {@code null}
     */
    Class<?> detectBasicClass(Class<?> type);
}

/** Recognizes {@code int} and {@link Integer}. */
class IntegerClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(int.class) || type.equals(Integer.class);
        return recognized ? Integer.class : null;
    }
}

/** Recognizes {@code long} and {@link Long}. */
class LongClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(long.class) || type.equals(Long.class);
        return recognized ? Long.class : null;
    }
}

/** Recognizes {@code double} and {@link Double}. */
class DoubleClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(double.class) || type.equals(Double.class);
        return recognized ? Double.class : null;
    }
}

/** Recognizes {@code float} and {@link Float}. */
class FloatClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(float.class) || type.equals(Float.class);
        return recognized ? Float.class : null;
    }
}

/** Recognizes {@code short} and {@link Short}. */
class ShortClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(short.class) || type.equals(Short.class);
        return recognized ? Short.class : null;
    }
}

/** Recognizes {@code char} and {@link Character}. */
class CharacterClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(char.class) || type.equals(Character.class);
        return recognized ? Character.class : null;
    }
}

/** Recognizes {@code byte} and {@link Byte}. */
class ByteClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(byte.class) || type.equals(Byte.class);
        return recognized ? Byte.class : null;
    }
}

/** Recognizes {@code boolean} and {@link Boolean}. */
class BooleanClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        boolean recognized = type.equals(boolean.class) || type.equals(Boolean.class);
        return recognized ? Boolean.class : null;
    }
}
|
|
@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
|||
}
|
||||
|
||||
protected abstract T formatTrimmed(String raw) throws Exception;
|
||||
|
||||
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
|
||||
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
|
||||
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
|
||||
/**
 * Detectors consulted, in list order, by {@code detectBasicClass}; the first
 * one that returns a non-null class determines the result.
 */
public static final List<BasicClassDetector> basicClassDetector= Arrays.asList(new IntegerClassDetector(),
|
||||
new LongClassDetector(),
|
||||
new FloatClassDetector(),
|
||||
new DoubleClassDetector(),
|
||||
new ShortClassDetector(),
|
||||
new ByteClassDetector(),
|
||||
new BooleanClassDetector(),
|
||||
new CharacterClassDetector());
|
||||
|
||||
public static Class<?> detectBasicClass(Class<?> type) {
|
||||
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
||||
return Integer.class;
|
||||
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
|
||||
return Long.class;
|
||||
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
|
||||
return Double.class;
|
||||
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
|
||||
return Float.class;
|
||||
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
|
||||
return Short.class;
|
||||
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
|
||||
return Character.class;
|
||||
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
|
||||
return Byte.class;
|
||||
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
|
||||
return Boolean.class;
|
||||
for (BasicClassDetector detector : basicClassDetector) {
|
||||
Class<?> detectedClass = detector.detectBasicClass(type);
|
||||
if (detectedClass != null) {
|
||||
return detectedClass;
|
||||
}
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue