update comments
parent
5073258237
commit
77e6ca2945
|
@ -6,10 +6,11 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 可抽取的纯文本,不包括xpath和css selector实现。<br>
|
* Selectable plain text.<br>
|
||||||
|
* Cannot be selected by XPath or CSS Selector.
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 上午7:54
|
|
||||||
*/
|
*/
|
||||||
public class PlainText implements Selectable {
|
public class PlainText implements Selectable {
|
||||||
|
|
||||||
|
@ -59,7 +60,7 @@ public class PlainText implements Selectable {
|
||||||
List<String> results = new ArrayList<String>();
|
List<String> results = new ArrayList<String>();
|
||||||
for (String string : strings) {
|
for (String string : strings) {
|
||||||
String result = selector.select(string);
|
String result = selector.select(string);
|
||||||
if (result!=null){
|
if (result != null) {
|
||||||
results.add(result);
|
results.add(result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
package us.codecraft.webmagic.utils;
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Marks features that are not yet stable.
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Stands for features not stable.
|
|
||||||
*/
|
*/
|
||||||
public @interface Experimental {
|
public @interface Experimental {
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,11 +3,10 @@ package us.codecraft.webmagic.utils;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 文件持久化的基础类。<br>
|
* Base class for file persistence.
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-8-11 <br>
|
* @since 0.2.0
|
||||||
* Time: 下午4:21 <br>
|
|
||||||
*/
|
*/
|
||||||
public class FilePersistentBase {
|
public class FilePersistentBase {
|
||||||
|
|
||||||
|
|
|
@ -6,10 +6,8 @@ import java.util.concurrent.ThreadPoolExecutor;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 线程工具类。<br>
|
|
||||||
* @author code4crafer@gmail.com
|
* @author code4crafer@gmail.com
|
||||||
* Date: 13-6-23
|
* @since 0.1.0
|
||||||
* Time: 下午7:11
|
|
||||||
*/
|
*/
|
||||||
public class ThreadUtils {
|
public class ThreadUtils {
|
||||||
|
|
||||||
|
|
|
@ -6,20 +6,20 @@ import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* url及html处理工具类。<br>
|
* URL and HTML utilities.
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* @since 0.1.0
|
||||||
* Time: 下午1:52
|
|
||||||
*/
|
*/
|
||||||
public class UrlUtils {
|
public class UrlUtils {
|
||||||
|
|
||||||
private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/");
|
private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 将url想对地址转化为绝对地址
|
* Converts a relative URL into an absolute URL.
|
||||||
* @param url url地址
|
* @param url the URL to convert
|
||||||
* @param refer url地址来自哪个页面
|
* @param refer the page the URL was found on
|
||||||
* @return url绝对地址
|
* @return the absolute URL
|
||||||
*/
|
*/
|
||||||
public static String canonicalizeUrl(String url, String refer) {
|
public static String canonicalizeUrl(String url, String refer) {
|
||||||
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
|
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
提供一些处理链接的静态工具类。
|
Static utility classes of webmagic.
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
Loading…
Reference in New Issue