Refactored Code to increase maintainability (#1152)
* Initial Commit * Assignment 1 Submission * Resolving Implementation Smells * Refactoring Code to increase maintainability — master
parent
28ac8bf9c4
commit
9b9f173c1c
|
@ -9,11 +9,8 @@ import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.concurrent.locks.Condition;
|
|
||||||
import java.util.concurrent.locks.ReentrantLock;
|
|
||||||
import org.apache.commons.collections4.CollectionUtils;
|
import org.apache.commons.collections4.CollectionUtils;
|
||||||
import org.apache.commons.lang3.SerializationUtils;
|
import org.apache.commons.lang3.SerializationUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -76,7 +73,7 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected String uuid;
|
protected String uuid;
|
||||||
|
|
||||||
protected Scheduler scheduler = new QueueScheduler();
|
protected SpiderScheduler scheduler;
|
||||||
|
|
||||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {
|
||||||
|
|
||||||
protected boolean destroyWhenExit = true;
|
protected boolean destroyWhenExit = true;
|
||||||
|
|
||||||
private ReentrantLock newUrlLock = new ReentrantLock();
|
|
||||||
|
|
||||||
private Condition newUrlCondition = newUrlLock.newCondition();
|
|
||||||
|
|
||||||
private List<SpiderListener> spiderListeners;
|
private List<SpiderListener> spiderListeners;
|
||||||
|
|
||||||
private final AtomicLong pageCount = new AtomicLong(0);
|
private final AtomicLong pageCount = new AtomicLong(0);
|
||||||
|
@ -131,6 +124,7 @@ public class Spider implements Runnable, Task {
|
||||||
public Spider(PageProcessor pageProcessor) {
|
public Spider(PageProcessor pageProcessor) {
|
||||||
this.pageProcessor = pageProcessor;
|
this.pageProcessor = pageProcessor;
|
||||||
this.site = pageProcessor.getSite();
|
this.site = pageProcessor.getSite();
|
||||||
|
this.scheduler = new SpiderScheduler(new QueueScheduler());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -186,15 +180,15 @@ public class Spider implements Runnable, Task {
|
||||||
/**
|
/**
|
||||||
* set scheduler for Spider
|
* set scheduler for Spider
|
||||||
*
|
*
|
||||||
* @param scheduler scheduler
|
* @param updateScheduler the scheduler that replaces the one currently wrapped by this spider
|
||||||
* @return this
|
* @return this
|
||||||
* @see Scheduler
|
* @see Scheduler
|
||||||
* @since 0.2.1
|
* @since 0.2.1
|
||||||
*/
|
*/
|
||||||
public Spider setScheduler(Scheduler scheduler) {
|
public Spider setScheduler(Scheduler updateScheduler) {
|
||||||
checkIfRunning();
|
checkIfRunning();
|
||||||
Scheduler oldScheduler = this.scheduler;
|
SpiderScheduler oldScheduler = this.scheduler;
|
||||||
this.scheduler = scheduler;
|
scheduler.setScheduler(updateScheduler);
|
||||||
if (oldScheduler != null) {
|
if (oldScheduler != null) {
|
||||||
Request request;
|
Request request;
|
||||||
while ((request = oldScheduler.poll(this)) != null) {
|
while ((request = oldScheduler.poll(this)) != null) {
|
||||||
|
@ -333,8 +327,8 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// wait until new url added,
|
// wait until new url added,
|
||||||
if (waitNewUrl()) {
|
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
|
||||||
//if interrupted
|
// if interrupted
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
@ -353,7 +347,7 @@ public class Spider implements Runnable, Task {
|
||||||
logger.error("process request " + request + " error", e);
|
logger.error("process request " + request + " error", e);
|
||||||
} finally {
|
} finally {
|
||||||
pageCount.incrementAndGet();
|
pageCount.incrementAndGet();
|
||||||
signalNewUrl();
|
scheduler.signalNewUrl();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -536,7 +530,7 @@ public class Spider implements Runnable, Task {
|
||||||
for (String url : urls) {
|
for (String url : urls) {
|
||||||
addRequest(new Request(url));
|
addRequest(new Request(url));
|
||||||
}
|
}
|
||||||
signalNewUrl();
|
scheduler.signalNewUrl();
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -588,42 +582,10 @@ public class Spider implements Runnable, Task {
|
||||||
for (Request request : requests) {
|
for (Request request : requests) {
|
||||||
addRequest(request);
|
addRequest(request);
|
||||||
}
|
}
|
||||||
signalNewUrl();
|
scheduler.signalNewUrl();
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return isInterrupted
|
|
||||||
*/
|
|
||||||
private boolean waitNewUrl() {
|
|
||||||
// now there may not be any thread live
|
|
||||||
newUrlLock.lock();
|
|
||||||
try {
|
|
||||||
//double check,unnecessary, unless very fast concurrent
|
|
||||||
if (threadPool.getThreadAlive() == 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
//wait for amount of time
|
|
||||||
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
|
||||||
return false;
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
// logger.warn("waitNewUrl - interrupted, error {}", e);
|
|
||||||
return true;
|
|
||||||
} finally {
|
|
||||||
newUrlLock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void signalNewUrl() {
|
|
||||||
try {
|
|
||||||
newUrlLock.lock();
|
|
||||||
newUrlCondition.signalAll();
|
|
||||||
} finally {
|
|
||||||
newUrlLock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void start() {
|
public void start() {
|
||||||
runAsync();
|
runAsync();
|
||||||
}
|
}
|
||||||
|
@ -799,7 +761,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scheduler getScheduler() {
|
public Scheduler getScheduler() {
|
||||||
return scheduler;
|
return scheduler.getScheduler();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.locks.Condition;
|
||||||
|
import java.util.concurrent.locks.ReentrantLock;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||||
|
import us.codecraft.webmagic.thread.CountableThreadPool;
|
||||||
|
|
||||||
|
public class SpiderScheduler {
|
||||||
|
private Scheduler scheduler;
|
||||||
|
private final ReentrantLock newUrlLock = new ReentrantLock();
|
||||||
|
private final Condition newUrlCondition = newUrlLock.newCondition();
|
||||||
|
|
||||||
|
public SpiderScheduler(Scheduler scheduler) {
|
||||||
|
this.scheduler = scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Scheduler getScheduler() {
|
||||||
|
return scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setScheduler(Scheduler scheduler) {
|
||||||
|
this.scheduler = scheduler;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Request poll(Spider spider) {
|
||||||
|
return scheduler.poll(spider);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void push(Request request, Spider spider) {
|
||||||
|
scheduler.push(request, spider);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
|
||||||
|
newUrlLock.lock();
|
||||||
|
try {
|
||||||
|
if (threadPool.getThreadAlive() == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
|
||||||
|
return false;
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
return true;
|
||||||
|
} finally {
|
||||||
|
newUrlLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void signalNewUrl() {
|
||||||
|
try {
|
||||||
|
newUrlLock.lock();
|
||||||
|
newUrlCondition.signalAll();
|
||||||
|
} finally {
|
||||||
|
newUrlLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
return elements;
|
return elements;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable smartContent() {
|
public Selectable smartContent() {
|
||||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||||
return select(smartContentSelector, getSourceTexts());
|
return select(smartContentSelector, getSourceTexts());
|
||||||
|
|
|
@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable {
|
||||||
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable smartContent() {
|
|
||||||
throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable links() {
|
public Selectable links() {
|
||||||
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
|
||||||
|
|
|
@ -51,14 +51,6 @@ public interface Selectable {
|
||||||
* @return new Selectable after extract
|
* @return new Selectable after extract
|
||||||
*/
|
*/
|
||||||
public Selectable css(String selector, String attrName);
|
public Selectable css(String selector, String attrName);
|
||||||
|
|
||||||
/**
|
|
||||||
* select smart content with ReadAbility algorithm
|
|
||||||
*
|
|
||||||
* @return content
|
|
||||||
*/
|
|
||||||
public Selectable smartContent();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* select all links
|
* select all links
|
||||||
*
|
*
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
package us.codecraft.webmagic.model.formatter;
|
||||||
|
|
||||||
|
/**
 * Strategy for recognising one basic (primitive or boxed) type.
 */
public interface BasicClassDetector {

    /**
     * @param type the type to inspect
     * @return the boxed wrapper class when {@code type} is the primitive or wrapper
     *         this detector handles, otherwise {@code null}
     */
    Class<?> detectBasicClass(Class<?> type);
}

/** Recognises {@code int} and {@link Integer}. */
class IntegerClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Integer.TYPE) || type.equals(Integer.class);
        return matches ? Integer.class : null;
    }
}

/** Recognises {@code long} and {@link Long}. */
class LongClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Long.TYPE) || type.equals(Long.class);
        return matches ? Long.class : null;
    }
}

/** Recognises {@code double} and {@link Double}. */
class DoubleClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Double.TYPE) || type.equals(Double.class);
        return matches ? Double.class : null;
    }
}

/** Recognises {@code float} and {@link Float}. */
class FloatClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Float.TYPE) || type.equals(Float.class);
        return matches ? Float.class : null;
    }
}

/** Recognises {@code short} and {@link Short}. */
class ShortClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Short.TYPE) || type.equals(Short.class);
        return matches ? Short.class : null;
    }
}

/** Recognises {@code char} and {@link Character}. */
class CharacterClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Character.TYPE) || type.equals(Character.class);
        return matches ? Character.class : null;
    }
}

/** Recognises {@code byte} and {@link Byte}. */
class ByteClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Byte.TYPE) || type.equals(Byte.class);
        return matches ? Byte.class : null;
    }
}

/** Recognises {@code boolean} and {@link Boolean}. */
class BooleanClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean matches = type.equals(Boolean.TYPE) || type.equals(Boolean.class);
        return matches ? Boolean.class : null;
    }
}
|
|
@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract T formatTrimmed(String raw) throws Exception;
|
protected abstract T formatTrimmed(String raw) throws Exception;
|
||||||
|
|
||||||
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
|
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
|
||||||
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
|
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
|
||||||
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
|
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
|
||||||
|
public static final List<BasicClassDetector> basicClassDetector= Arrays.asList(new IntegerClassDetector(),
|
||||||
|
new LongClassDetector(),
|
||||||
|
new FloatClassDetector(),
|
||||||
|
new DoubleClassDetector(),
|
||||||
|
new ShortClassDetector(),
|
||||||
|
new ByteClassDetector(),
|
||||||
|
new BooleanClassDetector(),
|
||||||
|
new CharacterClassDetector());
|
||||||
|
|
||||||
public static Class<?> detectBasicClass(Class<?> type) {
|
public static Class<?> detectBasicClass(Class<?> type) {
|
||||||
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
|
for (BasicClassDetector detector : basicClassDetector) {
|
||||||
return Integer.class;
|
Class<?> detectedClass = detector.detectBasicClass(type);
|
||||||
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
|
if (detectedClass != null) {
|
||||||
return Long.class;
|
return detectedClass;
|
||||||
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
|
}
|
||||||
return Double.class;
|
|
||||||
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
|
|
||||||
return Float.class;
|
|
||||||
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
|
|
||||||
return Short.class;
|
|
||||||
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
|
|
||||||
return Character.class;
|
|
||||||
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
|
|
||||||
return Byte.class;
|
|
||||||
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
|
|
||||||
return Boolean.class;
|
|
||||||
}
|
}
|
||||||
return type;
|
return type;
|
||||||
}
|
}
|
||||||
|
@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue