complete javadoc
parent
8b90b91e33
commit
7edfa26f90
|
@ -518,7 +518,7 @@ public class Spider implements Runnable, Task {
|
||||||
* Add urls with information to crawl.<br>
|
* Add urls with information to crawl.<br>
|
||||||
*
|
*
|
||||||
* @param requests requests
|
* @param requests requests
|
||||||
* @return
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Spider addRequest(Request... requests) {
|
public Spider addRequest(Request... requests) {
|
||||||
for (Request request : requests) {
|
for (Request request : requests) {
|
||||||
|
@ -730,7 +730,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set wait time when no url is polled.<br></br>
|
* Set wait time when no url is polled.<br><br>
|
||||||
*
|
*
|
||||||
* @param emptySleepTime In MILLISECONDS.
|
* @param emptySleepTime In MILLISECONDS.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -8,7 +8,7 @@ import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove duplicate urls and only push urls which are not duplicate.<br></br>
|
* Remove duplicate urls and only push urls which are not duplicate.<br><br>
|
||||||
*
|
*
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
* @since 0.5.0
|
* @since 0.5.0
|
||||||
|
|
|
@ -69,7 +69,7 @@ public class Html extends HtmlNode {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param selector selector
|
* @param selector selector
|
||||||
* @return
|
* @return result
|
||||||
*/
|
*/
|
||||||
public String selectDocument(Selector selector) {
|
public String selectDocument(Selector selector) {
|
||||||
if (selector instanceof ElementSelector) {
|
if (selector instanceof ElementSelector) {
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class HtmlNode extends AbstractSelectable {
|
||||||
* select elements
|
* select elements
|
||||||
*
|
*
|
||||||
* @param elementSelector elementSelector
|
* @param elementSelector elementSelector
|
||||||
* @return
|
* @return result
|
||||||
*/
|
*/
|
||||||
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
||||||
ListIterator<Element> elementIterator = getElements().listIterator();
|
ListIterator<Element> elementIterator = getElements().listIterator();
|
||||||
|
|
|
@ -35,6 +35,7 @@ public abstract class Selectors {
|
||||||
/**
|
/**
|
||||||
* @deprecated
|
* @deprecated
|
||||||
* @see #xpath(String)
|
* @see #xpath(String)
|
||||||
|
* @param expr expr
|
||||||
* @return new selector
|
* @return new selector
|
||||||
*/
|
*/
|
||||||
public static XpathSelector xsoup(String expr) {
|
public static XpathSelector xsoup(String expr) {
|
||||||
|
|
|
@ -7,10 +7,10 @@ import java.util.concurrent.locks.Condition;
|
||||||
import java.util.concurrent.locks.ReentrantLock;
|
import java.util.concurrent.locks.ReentrantLock;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Thread pool for workers.<br></br>
|
* Thread pool for workers.<br><br>
|
||||||
* Use {@link java.util.concurrent.ExecutorService} as the inner implementation. <br></br>
|
* Use {@link java.util.concurrent.ExecutorService} as the inner implementation. <br><br>
|
||||||
* New feature: <br></br>
|
* New feature: <br><br>
|
||||||
* 1. Block when the thread pool is full to avoid polling many urls without processing them. <br></br>
|
* 1. Block when the thread pool is full to avoid polling many urls without processing them. <br><br>
|
||||||
* 2. Count of thread alive for monitor.
|
* 2. Count of thread alive for monitor.
|
||||||
*
|
*
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
|
|
|
@ -52,7 +52,7 @@ public class UrlUtils {
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param url url
|
* @param url url
|
||||||
* @return
|
* @return new url
|
||||||
*/
|
*/
|
||||||
public static String encodeIllegalCharacterInUrl(String url) {
|
public static String encodeIllegalCharacterInUrl(String url) {
|
||||||
//TODO more character support
|
//TODO more character support
|
||||||
|
|
|
@ -9,7 +9,7 @@ import java.util.regex.Pattern;
|
||||||
* User: Sebastian MA
|
* User: Sebastian MA
|
||||||
* Date: April 03, 2014
|
* Date: April 03, 2014
|
||||||
* Time: 10:00
|
* Time: 10:00
|
||||||
* <p></p>
|
* <p>
|
||||||
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
||||||
* its two abstract methods.
|
* its two abstract methods.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -9,12 +9,12 @@ import us.codecraft.webmagic.Request;
|
||||||
public interface RequestMatcher {
|
public interface RequestMatcher {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check whether to process the page.<br></br>
|
* Check whether to process the page.<br><br>
|
||||||
* Please DO NOT change page status in this method.
|
* Please DO NOT change page status in this method.
|
||||||
*
|
*
|
||||||
* @param page page
|
* @param page page
|
||||||
*
|
*
|
||||||
* @return
|
* @return whether matches
|
||||||
*/
|
*/
|
||||||
public boolean match(Request page);
|
public boolean match(Request page);
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ public interface SubPipeline extends RequestMatcher {
|
||||||
/**
|
/**
|
||||||
* process the page, extract urls to fetch, extract the data and store
|
* process the page, extract urls to fetch, extract the data and store
|
||||||
*
|
*
|
||||||
* @param page page
|
* @param resultItems resultItems
|
||||||
* @param task task
|
* @param task task
|
||||||
* @return whether continue to match
|
* @return whether continue to match
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -25,7 +25,7 @@ import java.util.List;
|
||||||
* private String content;
|
* private String content;
|
||||||
*
|
*
|
||||||
* {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
* {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
|
||||||
* private List<String> tags;
|
* private List<String> tags;
|
||||||
* }
|
* }
|
||||||
* </pre>
|
* </pre>
|
||||||
* And start the spider by:
|
* And start the spider by:
|
||||||
|
|
|
@ -43,7 +43,7 @@ public class SpiderMonitor {
|
||||||
* Register spider for monitor.
|
* Register spider for monitor.
|
||||||
*
|
*
|
||||||
* @param spiders spiders
|
* @param spiders spiders
|
||||||
* @return
|
* @return this
|
||||||
*/
|
*/
|
||||||
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
||||||
for (Spider spider : spiders) {
|
for (Spider spider : spiders) {
|
||||||
|
|
|
@ -30,6 +30,7 @@ public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
|
||||||
/**
|
/**
|
||||||
* init map with protoMapClass
|
* init map with protoMapClass
|
||||||
*
|
*
|
||||||
|
* @param map the origin map to contain the DoubleKeyMap
|
||||||
* @param protoMapClass protoMapClass
|
* @param protoMapClass protoMapClass
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("rawtypes")
|
@SuppressWarnings("rawtypes")
|
||||||
|
|
Loading…
Reference in New Issue