Refactored Code to increase maintainability (#1152)

* Initial Commit

* Assignment 1 Submission

* Resolving Implementation Smells

* Refactoring Code to increase maintainability
master
ayushi250317 2024-03-30 03:26:41 -03:00 committed by GitHub
parent 28ac8bf9c4
commit 9b9f173c1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 174 additions and 87 deletions

View File

@ -9,11 +9,8 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.UUID; import java.util.UUID;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -76,7 +73,7 @@ public class Spider implements Runnable, Task {
protected String uuid; protected String uuid;
protected Scheduler scheduler = new QueueScheduler(); protected SpiderScheduler scheduler;
protected Logger logger = LoggerFactory.getLogger(getClass()); protected Logger logger = LoggerFactory.getLogger(getClass());
@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {
protected boolean destroyWhenExit = true; protected boolean destroyWhenExit = true;
private ReentrantLock newUrlLock = new ReentrantLock();
private Condition newUrlCondition = newUrlLock.newCondition();
private List<SpiderListener> spiderListeners; private List<SpiderListener> spiderListeners;
private final AtomicLong pageCount = new AtomicLong(0); private final AtomicLong pageCount = new AtomicLong(0);
@ -131,6 +124,7 @@ public class Spider implements Runnable, Task {
public Spider(PageProcessor pageProcessor) { public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor; this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite(); this.site = pageProcessor.getSite();
this.scheduler = new SpiderScheduler(new QueueScheduler());
} }
/** /**
@ -186,15 +180,15 @@ public class Spider implements Runnable, Task {
/** /**
* set scheduler for Spider * set scheduler for Spider
* *
* @param scheduler scheduler * @param updateScheduler scheduler
* @return this * @return this
* @see Scheduler * @see Scheduler
* @since 0.2.1 * @since 0.2.1
*/ */
public Spider setScheduler(Scheduler scheduler) { public Spider setScheduler(Scheduler updateScheduler) {
checkIfRunning(); checkIfRunning();
Scheduler oldScheduler = this.scheduler; SpiderScheduler oldScheduler = this.scheduler;
this.scheduler = scheduler; scheduler.setScheduler(updateScheduler);
if (oldScheduler != null) { if (oldScheduler != null) {
Request request; Request request;
while ((request = oldScheduler.poll(this)) != null) { while ((request = oldScheduler.poll(this)) != null) {
@ -213,7 +207,7 @@ public class Spider implements Runnable, Task {
* @deprecated * @deprecated
*/ */
@Deprecated @Deprecated
public Spider pipeline(Pipeline pipeline) { public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline); return addPipeline(pipeline);
} }
@ -264,7 +258,7 @@ public class Spider implements Runnable, Task {
* @deprecated * @deprecated
*/ */
@Deprecated @Deprecated
public Spider downloader(Downloader downloader) { public Spider downloader(Downloader downloader) {
return setDownloader(downloader); return setDownloader(downloader);
} }
@ -333,10 +327,10 @@ public class Spider implements Runnable, Task {
} }
} else { } else {
// wait until new url added // wait until new url added
if (waitNewUrl()) { if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
//if interrupted // if interrupted
break; break;
} }
continue; continue;
} }
} }
@ -353,7 +347,7 @@ public class Spider implements Runnable, Task {
logger.error("process request " + request + " error", e); logger.error("process request " + request + " error", e);
} finally { } finally {
pageCount.incrementAndGet(); pageCount.incrementAndGet();
signalNewUrl(); scheduler.signalNewUrl();
} }
} }
}); });
@ -536,7 +530,7 @@ public class Spider implements Runnable, Task {
for (String url : urls) { for (String url : urls) {
addRequest(new Request(url)); addRequest(new Request(url));
} }
signalNewUrl(); scheduler.signalNewUrl();
return this; return this;
} }
@ -588,42 +582,10 @@ public class Spider implements Runnable, Task {
for (Request request : requests) { for (Request request : requests) {
addRequest(request); addRequest(request);
} }
signalNewUrl(); scheduler.signalNewUrl();
return this; return this;
} }
/**
*
* @return isInterrupted
*/
private boolean waitNewUrl() {
// now there may not be any thread live
newUrlLock.lock();
try {
//double checkunnecessary, unless very fast concurrent
if (threadPool.getThreadAlive() == 0) {
return false;
}
//wait for amount of time
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
return false;
} catch (InterruptedException e) {
// logger.warn("waitNewUrl - interrupted, error {}", e);
return true;
} finally {
newUrlLock.unlock();
}
}
private void signalNewUrl() {
try {
newUrlLock.lock();
newUrlCondition.signalAll();
} finally {
newUrlLock.unlock();
}
}
public void start() { public void start() {
runAsync(); runAsync();
} }
@ -799,7 +761,7 @@ public class Spider implements Runnable, Task {
} }
public Scheduler getScheduler() { public Scheduler getScheduler() {
return scheduler; return scheduler.getScheduler();
} }
/** /**

View File

@ -0,0 +1,59 @@
package us.codecraft.webmagic;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
/**
 * Wraps a {@link Scheduler} together with the lock/condition pair used to
 * park a caller while no request is available and to wake it up again.
 *
 * <p>{@link #poll(Spider)} and {@link #push(Request, Spider)} are plain
 * delegations to the wrapped scheduler; {@link #waitNewUrl} and
 * {@link #signalNewUrl()} operate only on the internal lock and condition.
 */
public class SpiderScheduler {

    /** Delegate that stores and hands out requests; replaceable via {@link #setScheduler}. */
    private Scheduler scheduler;

    /** Guards {@link #newUrlCondition}. */
    private final ReentrantLock newUrlLock = new ReentrantLock();

    /** Signalled by {@link #signalNewUrl()}, awaited by {@link #waitNewUrl}. */
    private final Condition newUrlCondition = newUrlLock.newCondition();

    /**
     * Creates a wrapper around the given scheduler.
     *
     * @param scheduler the scheduler to delegate to
     */
    public SpiderScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * @return the scheduler currently being delegated to
     */
    public Scheduler getScheduler() {
        return scheduler;
    }

    /**
     * Replaces the wrapped scheduler. This method only swaps the reference;
     * requests still held by the previous scheduler are not moved here.
     *
     * @param scheduler the scheduler to delegate to from now on
     */
    public void setScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * Delegates to the wrapped scheduler's {@code poll}.
     *
     * @param spider the spider passed through to the wrapped scheduler
     * @return whatever the wrapped scheduler returns
     */
    public Request poll(Spider spider) {
        return scheduler.poll(spider);
    }

    /**
     * Delegates to the wrapped scheduler's {@code push}.
     *
     * @param request the request to hand to the wrapped scheduler
     * @param spider  the spider passed through to the wrapped scheduler
     */
    public void push(Request request, Spider spider) {
        scheduler.push(request, spider);
    }

    /**
     * Waits on the internal condition for at most {@code emptySleepTime}
     * milliseconds, unless the pool reports no alive threads, in which case
     * it returns immediately.
     *
     * @param threadPool     pool whose alive-thread count is checked before waiting
     * @param emptySleepTime maximum time to wait, in milliseconds
     * @return {@code true} if the wait was interrupted; {@code false} otherwise
     */
    public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
        newUrlLock.lock();
        try {
            // Checked under the lock: with no alive threads there is nothing to wait for.
            if (threadPool.getThreadAlive() == 0) {
                return false;
            }
            // The await result (signalled vs. timed out) is deliberately ignored;
            // the caller re-evaluates its own state either way.
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
            return false;
        } catch (InterruptedException e) {
            // Interruption is reported through the return value only; the
            // thread's interrupt flag is not restored here.
            return true;
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Wakes every thread currently blocked in {@link #waitNewUrl}.
     */
    public void signalNewUrl() {
        // Acquire before entering the try block: if lock() itself failed, the
        // finally clause must not call unlock() on a lock this thread never
        // held (that would throw IllegalMonitorStateException and hide the
        // original failure).
        newUrlLock.lock();
        try {
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }
}

View File

@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable {
return elements; return elements;
} }
@Override
public Selectable smartContent() { public Selectable smartContent() {
SmartContentSelector smartContentSelector = Selectors.smartContent(); SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, getSourceTexts()); return select(smartContentSelector, getSourceTexts());

View File

@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable {
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
} }
@Override
public Selectable smartContent() {
throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
}
@Override @Override
public Selectable links() { public Selectable links() {
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");

View File

@ -51,14 +51,6 @@ public interface Selectable {
* @return new Selectable after extract * @return new Selectable after extract
*/ */
public Selectable css(String selector, String attrName); public Selectable css(String selector, String attrName);
/**
* select smart content with ReadAbility algorithm
*
* @return content
*/
public Selectable smartContent();
/** /**
* select all links * select all links
* *

View File

@ -0,0 +1,85 @@
package us.codecraft.webmagic.model.formatter;
/**
 * Strategy that recognises one primitive type or its wrapper and reports the
 * wrapper class for it.
 */
public interface BasicClassDetector {

    /**
     * Maps {@code type} to the wrapper class this detector is responsible for.
     *
     * @param type the class to inspect; must not be {@code null}
     * @return the wrapper class when {@code type} is the handled primitive or
     *         its wrapper, otherwise {@code null}
     */
    Class<?> detectBasicClass(Class<?> type);
}

/**
 * Shared implementation for all detectors in this file: each concrete
 * detector differs only in the primitive/wrapper pair it recognises, so the
 * comparison lives here once instead of being repeated per type.
 */
abstract class PrimitiveOrWrapperDetector implements BasicClassDetector {

    private final Class<?> primitiveType;
    private final Class<?> wrapperType;

    /**
     * @param primitiveType the primitive class token, e.g. {@code Integer.TYPE}
     * @param wrapperType   the matching wrapper class, e.g. {@code Integer.class}
     */
    PrimitiveOrWrapperDetector(Class<?> primitiveType, Class<?> wrapperType) {
        this.primitiveType = primitiveType;
        this.wrapperType = wrapperType;
    }

    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        // equals() is invoked on the argument so that a null argument still
        // fails with NullPointerException, as each detector did before.
        if (type.equals(primitiveType) || type.equals(wrapperType)) {
            return wrapperType;
        }
        return null;
    }
}

/** Recognises {@code int} and {@link Integer}. */
class IntegerClassDetector extends PrimitiveOrWrapperDetector {
    IntegerClassDetector() {
        super(Integer.TYPE, Integer.class);
    }
}

/** Recognises {@code long} and {@link Long}. */
class LongClassDetector extends PrimitiveOrWrapperDetector {
    LongClassDetector() {
        super(Long.TYPE, Long.class);
    }
}

/** Recognises {@code double} and {@link Double}. */
class DoubleClassDetector extends PrimitiveOrWrapperDetector {
    DoubleClassDetector() {
        super(Double.TYPE, Double.class);
    }
}

/** Recognises {@code float} and {@link Float}. */
class FloatClassDetector extends PrimitiveOrWrapperDetector {
    FloatClassDetector() {
        super(Float.TYPE, Float.class);
    }
}

/** Recognises {@code short} and {@link Short}. */
class ShortClassDetector extends PrimitiveOrWrapperDetector {
    ShortClassDetector() {
        super(Short.TYPE, Short.class);
    }
}

/** Recognises {@code char} and {@link Character}. */
class CharacterClassDetector extends PrimitiveOrWrapperDetector {
    CharacterClassDetector() {
        super(Character.TYPE, Character.class);
    }
}

/** Recognises {@code byte} and {@link Byte}. */
class ByteClassDetector extends PrimitiveOrWrapperDetector {
    ByteClassDetector() {
        super(Byte.TYPE, Byte.class);
    }
}

/** Recognises {@code boolean} and {@link Boolean}. */
class BooleanClassDetector extends PrimitiveOrWrapperDetector {
    BooleanClassDetector() {
        super(Boolean.TYPE, Boolean.class);
    }
}

View File

@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
} }
protected abstract T formatTrimmed(String raw) throws Exception; protected abstract T formatTrimmed(String raw) throws Exception;
public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class, public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
public static final List<BasicClassDetector> basicClassDetector= Arrays.asList(new IntegerClassDetector(),
new LongClassDetector(),
new FloatClassDetector(),
new DoubleClassDetector(),
new ShortClassDetector(),
new ByteClassDetector(),
new BooleanClassDetector(),
new CharacterClassDetector());
public static Class<?> detectBasicClass(Class<?> type) { public static Class<?> detectBasicClass(Class<?> type) {
if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { for (BasicClassDetector detector : basicClassDetector) {
return Integer.class; Class<?> detectedClass = detector.detectBasicClass(type);
} else if (type.equals(Long.TYPE) || type.equals(Long.class)) { if (detectedClass != null) {
return Long.class; return detectedClass;
} else if (type.equals(Double.TYPE) || type.equals(Double.class)) { }
return Double.class;
} else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
return Float.class;
} else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
return Short.class;
} else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
return Character.class;
} else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
return Byte.class;
} else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
return Boolean.class;
} }
return type; return type;
} }
@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
} }
} }
} }