diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 9f9201e..11a671f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,11 +9,8 @@ import java.util.Date; import java.util.List; import java.util.UUID; import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; @@ -75,9 +72,9 @@ public class Spider implements Runnable, Task { protected Site site; protected String uuid; - - protected Scheduler scheduler = new QueueScheduler(); - + + protected SpiderScheduler scheduler; + protected Logger logger = LoggerFactory.getLogger(getClass()); protected CountableThreadPool threadPool; @@ -100,10 +97,6 @@ public class Spider implements Runnable, Task { protected boolean destroyWhenExit = true; - private ReentrantLock newUrlLock = new ReentrantLock(); - - private Condition newUrlCondition = newUrlLock.newCondition(); - private List spiderListeners; private final AtomicLong pageCount = new AtomicLong(0); @@ -131,6 +124,7 @@ public class Spider implements Runnable, Task { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); + this.scheduler = new SpiderScheduler(new QueueScheduler()); } /** @@ -186,15 +180,15 @@ public class Spider implements Runnable, Task { /** * set scheduler for Spider * - * @param scheduler scheduler + * @param updateScheduler scheduler * @return this * @see Scheduler * @since 0.2.1 */ - public Spider setScheduler(Scheduler scheduler) { + public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - Scheduler oldScheduler = this.scheduler; - this.scheduler = scheduler; + SpiderScheduler oldScheduler = this.scheduler; + scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; while ((request = oldScheduler.poll(this)) != null) { @@ -213,7 +207,7 @@ public class Spider implements Runnable, Task { * @deprecated */ @Deprecated - public Spider pipeline(Pipeline pipeline) { + public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -264,7 +258,7 @@ public class Spider implements Runnable, Task { * @deprecated */ @Deprecated - public Spider downloader(Downloader downloader) { + public Spider downloader(Downloader downloader) { return setDownloader(downloader); } @@ -333,10 +327,10 @@ public class Spider implements Runnable, Task { } } else { // wait until new url added, - if (waitNewUrl()) { - //if interrupted + if (scheduler.waitNewUrl(threadPool, emptySleepTime)) { + // if interrupted break; - } + } continue; } } @@ -353,7 +347,7 @@ public class Spider implements Runnable, Task { logger.error("process request " + request + " error", e); } finally { pageCount.incrementAndGet(); - signalNewUrl(); + scheduler.signalNewUrl(); } } }); @@ -536,7 +530,7 @@ public class Spider implements Runnable, Task { for (String url : urls) { addRequest(new Request(url)); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } @@ -588,42 +582,10 @@ public class Spider implements Runnable, Task { for (Request request : requests) { addRequest(request); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } - /** - * - * @return isInterrupted - */ - private boolean waitNewUrl() { - // now there may not be any thread live - newUrlLock.lock(); - try { - //double check,unnecessary, unless very fast concurrent - if (threadPool.getThreadAlive() == 0) { - return false; - } - //wait for amount of time - newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); - return false; - } catch (InterruptedException e) { - // logger.warn("waitNewUrl - interrupted, error {}", e); - return true; - } finally { - newUrlLock.unlock(); - } - } - - private void signalNewUrl() { - try { - newUrlLock.lock(); - newUrlCondition.signalAll(); - } finally { - newUrlLock.unlock(); - } - } - public void start() { runAsync(); } @@ -799,7 +761,7 @@ public class Spider implements Runnable, Task { } public Scheduler getScheduler() { - return scheduler; + return scheduler.getScheduler(); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java new file mode 100644 index 0000000..1005bac --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + +import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.thread.CountableThreadPool; + +public class SpiderScheduler { + private Scheduler scheduler; + private final ReentrantLock newUrlLock = new ReentrantLock(); + private final Condition newUrlCondition = newUrlLock.newCondition(); + + public SpiderScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Scheduler getScheduler() { + return scheduler; + } + + public void setScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Request poll(Spider spider) { + return scheduler.poll(spider); + } + + public void push(Request request, Spider spider) { + scheduler.push(request, spider); + } + + public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) { + newUrlLock.lock(); + try { + if (threadPool.getThreadAlive() == 0) { + return false; + } + newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; + } catch (InterruptedException e) { + return true; + } finally { + newUrlLock.unlock(); + } + } + + public void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index c063b48..85ff5fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -26,7 +26,6 @@ public class HtmlNode extends AbstractSelectable { return elements; } - @Override public Selectable smartContent() { SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, getSourceTexts()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index c78f679..18258e9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -42,11 +42,6 @@ public class PlainText extends AbstractSelectable { throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } - @Override - public Selectable smartContent() { - throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); - } - @Override public Selectable links() { throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 9412cfc..a4d5fdb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -51,14 +51,6 @@ public interface Selectable { * @return new Selectable after extract */ public Selectable css(String selector, String attrName); - - /** - * select smart content with ReadAbility algorithm - * - * @return content - */ - public Selectable smartContent(); - /** * select all links * diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java new file mode 100644 index 0000000..f03b886 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java @@ -0,0 +1,85 @@ +package us.codecraft.webmagic.model.formatter; + +public interface BasicClassDetector { + Class detectBasicClass(Class type); +} + +class IntegerClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } + return null; + } +} + +class LongClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } + return null; + } +} + +class DoubleClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } + return null; + } +} + +class FloatClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } + return null; + } +} + +class ShortClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } + return null; + } +} + +class CharacterClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Character.TYPE) || type.equals(Character.class)) { + return Character.class; + } + return null; + } +} + +class ByteClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } + return null; + } +} + +class BooleanClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return null; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java index f9d76a8..2d4d85b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -24,28 +24,24 @@ public abstract class BasicTypeFormatter implements ObjectFormatter { } protected abstract T formatTrimmed(String raw) throws Exception; - public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); + public static final List basicClassDetector= Arrays.asList(new IntegerClassDetector(), + new LongClassDetector(), + new FloatClassDetector(), + new DoubleClassDetector(), + new ShortClassDetector(), + new ByteClassDetector(), + new BooleanClassDetector(), + new CharacterClassDetector()); public static Class detectBasicClass(Class type) { - if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { - return Integer.class; - } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { - return Long.class; - } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { - return Double.class; - } else if (type.equals(Float.TYPE) || type.equals(Float.class)) { - return Float.class; - } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { - return Short.class; - } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { - return Character.class; - } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { - return Byte.class; - } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { - return Boolean.class; + for (BasicClassDetector detector : basicClassDetector) { + Class detectedClass = detector.detectBasicClass(type); + if (detectedClass != null) { + return detectedClass; + } } return type; } @@ -146,5 +142,4 @@ public abstract class BasicTypeFormatter implements ObjectFormatter { } } - }