Refactored Code to increase maintainability (#1152)

* Initial Commit

* Assignment 1 Submission

* Resolving Implementation Smells

* Refactoring Code to increase maintainability
master
ayushi250317 2024-03-30 03:26:41 -03:00 committed by GitHub
parent 28ac8bf9c4
commit 9b9f173c1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 174 additions and 87 deletions

View File

@ -9,11 +9,8 @@ import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
@ -75,9 +72,9 @@ public class Spider implements Runnable, Task {
protected Site site;
protected String uuid;
protected Scheduler scheduler = new QueueScheduler();
protected SpiderScheduler scheduler;
protected Logger logger = LoggerFactory.getLogger(getClass());
protected CountableThreadPool threadPool;
@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {
protected boolean destroyWhenExit = true;
private ReentrantLock newUrlLock = new ReentrantLock();
private Condition newUrlCondition = newUrlLock.newCondition();
private List<SpiderListener> spiderListeners;
private final AtomicLong pageCount = new AtomicLong(0);
@ -131,6 +124,7 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.scheduler = new SpiderScheduler(new QueueScheduler());
}
/**
@ -186,15 +180,15 @@ public class Spider implements Runnable, Task {
/**
* set scheduler for Spider
*
* @param scheduler scheduler
* @param updateScheduler scheduler
* @return this
* @see Scheduler
* @since 0.2.1
*/
public Spider setScheduler(Scheduler scheduler) {
public Spider setScheduler(Scheduler updateScheduler) {
checkIfRunning();
Scheduler oldScheduler = this.scheduler;
this.scheduler = scheduler;
SpiderScheduler oldScheduler = this.scheduler;
scheduler.setScheduler(updateScheduler);
if (oldScheduler != null) {
Request request;
while ((request = oldScheduler.poll(this)) != null) {
@ -213,7 +207,7 @@ public class Spider implements Runnable, Task {
* @deprecated
*/
@Deprecated
public Spider pipeline(Pipeline pipeline) {
public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
}
@ -264,7 +258,7 @@ public class Spider implements Runnable, Task {
* @deprecated
*/
@Deprecated
public Spider downloader(Downloader downloader) {
public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
}
@ -333,10 +327,10 @@ public class Spider implements Runnable, Task {
}
} else {
// wait until new url added
if (waitNewUrl()) {
//if interrupted
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
// if interrupted
break;
}
}
continue;
}
}
@ -353,7 +347,7 @@ public class Spider implements Runnable, Task {
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
signalNewUrl();
scheduler.signalNewUrl();
}
}
});
@ -536,7 +530,7 @@ public class Spider implements Runnable, Task {
for (String url : urls) {
addRequest(new Request(url));
}
signalNewUrl();
scheduler.signalNewUrl();
return this;
}
@ -588,42 +582,10 @@ public class Spider implements Runnable, Task {
for (Request request : requests) {
addRequest(request);
}
signalNewUrl();
scheduler.signalNewUrl();
return this;
}
/**
*
* @return isInterrupted
*/
private boolean waitNewUrl() {
    boolean interrupted = false;
    newUrlLock.lock();
    try {
        // Only park while at least one worker is alive; with none alive
        // the method returns immediately without waiting.
        if (threadPool.getThreadAlive() != 0) {
            // Bounded wait: returns on signal or after emptySleepTime ms.
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
        }
    } catch (InterruptedException e) {
        // Interruption is reported to the caller through the return value.
        interrupted = true;
    } finally {
        newUrlLock.unlock();
    }
    return interrupted;
}
/**
 * Wakes every thread currently waiting on {@code newUrlCondition}.
 */
private void signalNewUrl() {
    // Acquire the lock BEFORE entering try: if lock() itself fails, the
    // finally block must not call unlock() on a lock this thread never held
    // (that would throw IllegalMonitorStateException and mask the real error).
    newUrlLock.lock();
    try {
        newUrlCondition.signalAll();
    } finally {
        newUrlLock.unlock();
    }
}
/**
 * Starts this spider by delegating to {@link #runAsync()}.
 */
public void start() {
runAsync();
}
@ -799,7 +761,7 @@ public class Spider implements Runnable, Task {
}
public Scheduler getScheduler() {
return scheduler;
return scheduler.getScheduler();
}
/**

View File

@ -0,0 +1,59 @@
package us.codecraft.webmagic;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
/**
 * Wraps a {@link Scheduler} delegate together with the lock and condition a
 * crawl loop uses to wait for, and be notified of, newly available URLs.
 *
 * <p>{@link #poll(Spider)} and {@link #push(Request, Spider)} forward to the
 * current delegate. The delegate reference is a plain (non-volatile) field.
 */
public class SpiderScheduler {

    /** Delegate that actually stores and hands out requests. */
    private Scheduler scheduler;

    /** Guards {@link #newUrlCondition}. */
    private final ReentrantLock newUrlLock = new ReentrantLock();

    /** Signalled by {@link #signalNewUrl()}, awaited by {@link #waitNewUrl}. */
    private final Condition newUrlCondition = newUrlLock.newCondition();

    /**
     * @param scheduler the initial delegate
     */
    public SpiderScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * @return the delegate currently in use
     */
    public Scheduler getScheduler() {
        return scheduler;
    }

    /**
     * Replaces the delegate. Nothing is transferred from the previous
     * delegate: a caller that still needs requests held by it must obtain it
     * through {@link #getScheduler()} <em>before</em> calling this method.
     *
     * @param scheduler the new delegate
     */
    public void setScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * Forwards to {@code Scheduler.poll} on the current delegate.
     *
     * @param spider passed through to the delegate
     * @return whatever the delegate returns
     */
    public Request poll(Spider spider) {
        return scheduler.poll(spider);
    }

    /**
     * Forwards to {@code Scheduler.push} on the current delegate.
     *
     * @param request passed through to the delegate
     * @param spider  passed through to the delegate
     */
    public void push(Request request, Spider spider) {
        scheduler.push(request, spider);
    }

    /**
     * Waits up to {@code emptySleepTime} milliseconds for
     * {@link #signalNewUrl()}, unless {@code threadPool} reports no live
     * thread, in which case it returns immediately.
     *
     * @param threadPool     pool whose {@code getThreadAlive()} count is checked
     * @param emptySleepTime maximum wait in milliseconds
     * @return {@code true} only if the wait was interrupted; {@code false}
     *         on signal, on timeout, or when no thread was alive
     */
    public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
        newUrlLock.lock();
        try {
            if (threadPool.getThreadAlive() == 0) {
                return false;
            }
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
            return false;
        } catch (InterruptedException e) {
            // Interruption is surfaced through the return value. The thread's
            // interrupt flag is intentionally left as-is so existing callers
            // observe the same state as before.
            return true;
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Wakes every thread currently blocked in {@link #waitNewUrl}.
     */
    public void signalNewUrl() {
        // lock() must precede try: if acquisition fails, finally must not
        // call unlock() on a lock this thread does not hold.
        newUrlLock.lock();
        try {
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }
}

View File

@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable {
return elements;
}
@Override
public Selectable smartContent() {
SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, getSourceTexts());

View File

@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable {
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
}
@Override
public Selectable smartContent() {
    // Not available on plain text: always rejected.
    final String reason = "Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).";
    throw new UnsupportedOperationException(reason);
}
@Override
public Selectable links() {
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");

View File

@ -51,14 +51,6 @@ public interface Selectable {
* @return new Selectable after extract
*/
public Selectable css(String selector, String attrName);
/**
 * Select the "smart content" of this selectable using the Readability algorithm.
 *
 * @return content
 * @throws UnsupportedOperationException in implementations that do not support
 *         this operation (the plain-text implementation always throws)
 */
public Selectable smartContent();
/**
* select all links
*

View File

@ -0,0 +1,85 @@
package us.codecraft.webmagic.model.formatter;
/**
 * Strategy that recognises one primitive/wrapper pair.
 */
public interface BasicClassDetector {

    /**
     * Checks whether {@code type} is the pair handled by this detector.
     *
     * @param type class to inspect
     * @return the wrapper class when {@code type} is handled, {@code null} otherwise
     */
    Class<?> detectBasicClass(Class<?> type);
}

/** Recognises {@code int} and {@link Integer}. */
class IntegerClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(int.class) || type.equals(Integer.class);
        return handled ? Integer.class : null;
    }
}

/** Recognises {@code long} and {@link Long}. */
class LongClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(long.class) || type.equals(Long.class);
        return handled ? Long.class : null;
    }
}

/** Recognises {@code double} and {@link Double}. */
class DoubleClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(double.class) || type.equals(Double.class);
        return handled ? Double.class : null;
    }
}

/** Recognises {@code float} and {@link Float}. */
class FloatClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(float.class) || type.equals(Float.class);
        return handled ? Float.class : null;
    }
}

/** Recognises {@code short} and {@link Short}. */
class ShortClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(short.class) || type.equals(Short.class);
        return handled ? Short.class : null;
    }
}

/** Recognises {@code char} and {@link Character}. */
class CharacterClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(char.class) || type.equals(Character.class);
        return handled ? Character.class : null;
    }
}

/** Recognises {@code byte} and {@link Byte}. */
class ByteClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(byte.class) || type.equals(Byte.class);
        return handled ? Byte.class : null;
    }
}

/** Recognises {@code boolean} and {@link Boolean}. */
class BooleanClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        final boolean handled = type.equals(boolean.class) || type.equals(Boolean.class);
        return handled ? Boolean.class : null;
    }
}

View File

@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
}
/**
 * Produces the typed value for {@code raw}.
 *
 * <p>NOTE(review): judging by the name, callers presumably pass an
 * already-trimmed string; the caller is outside this view, so confirm.
 *
 * @param raw the textual value to convert
 * @return the converted value
 * @throws Exception declared so implementations may propagate any failure
 */
protected abstract T formatTrimmed(String raw) throws Exception;
/**
 * Formatter classes for the basic types, in declaration order.
 */
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters =
        Arrays.<Class<? extends ObjectFormatter>>asList(
                IntegerFormatter.class,
                LongFormatter.class,
                DoubleFormatter.class,
                FloatFormatter.class,
                ShortFormatter.class,
                CharactorFormatter.class,
                ByteFormatter.class,
                BooleanFormatter.class);
/**
 * Detectors for the primitive/wrapper pairs, one instance per pair, held in
 * the order given here.
 */
public static final List<BasicClassDetector> basicClassDetector =
        Arrays.<BasicClassDetector>asList(
                new IntegerClassDetector(),
                new LongClassDetector(),
                new FloatClassDetector(),
                new DoubleClassDetector(),
                new ShortClassDetector(),
                new ByteClassDetector(),
                new BooleanClassDetector(),
                new CharacterClassDetector());
public static Class<?> detectBasicClass(Class<?> type) {
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
return Integer.class;
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
return Long.class;
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
return Double.class;
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
return Float.class;
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
return Short.class;
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
return Character.class;
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
return Byte.class;
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
return Boolean.class;
for (BasicClassDetector detector : basicClassDetector) {
Class<?> detectedClass = detector.detectBasicClass(type);
if (detectedClass != null) {
return detectedClass;
}
}
return type;
}
@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
}
}
}