tags;
* }
-
+ *
* And start the spider by:
*
* OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
* ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
* }
-
+ *
+ *
* @author code4crafter@gmail.com
* @since 0.2.0
*/
@@ -49,6 +53,7 @@ public class OOSpider extends Spider {
/**
* create a spider
+ *
* @param site
* @param pageModelPipeline
* @param pageModels
@@ -57,7 +62,7 @@ public class OOSpider extends Spider {
this(ModelPageProcessor.create(site, pageModels));
this.modelPipeline = new ModelPipeline();
super.addPipeline(modelPipeline);
- if (pageModelPipeline!=null){
+ if (pageModelPipeline != null) {
for (Class pageModel : pageModels) {
this.modelPipeline.put(pageModel, pageModelPipeline);
}
@@ -72,6 +77,22 @@ public class OOSpider extends Spider {
return new OOSpider(site, pageModelPipeline, pageModels);
}
+ public static OOSpider direct(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { // factory: forwards all three arguments unchanged to the OOSpider constructor
+ return new OOSpider(site, pageModelPipeline, pageModels);
+ }
+
+ public static OOSpider direct(PageModelPipeline pageModelPipeline, Class... pageModels) { // factory variant that supplies no Site
+ return new OOSpider(null, pageModelPipeline, pageModels); // NOTE(review): the null site appears to be forwarded to ModelPageProcessor.create(site, pageModels) - confirm it accepts null
+ }
+
+ public static OOSpider direct(Class... pageModels) { // factory variant that supplies neither a Site nor a PageModelPipeline
+ return new OOSpider(null, null, pageModels); // null pipeline is skipped by the constructor's null check; NOTE(review): null site appears to reach ModelPageProcessor.create - confirm it accepts null
+ }
+
+ public static OOSpider direct(Collection params,Class... pageModels) { // NOTE(review): params is never read in this method - the result is the same as direct(pageModels); confirm intended use
+ return new OOSpider(null, null, pageModels); // no Site and no PageModelPipeline are supplied
+ }
+
public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
for (Class pageModel : pageModels) {
modelPageProcessor.addPageModel(pageModel);
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java
new file mode 100644
index 0000000..a940a64
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/UrlTemplate.java
@@ -0,0 +1,37 @@
+package us.codecraft.webmagic.model.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.Target;
+
+/**
+ * Define the url patterns for class.
+ * All urls matching the pattern will be crawled and extracted for new objects.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.3.3
+ */
+@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
+@Target({ElementType.TYPE})
+public @interface UrlTemplate {
+
+ /**
+ * The url patterns for class.
+ * Use regex expression with some changes:
+ * "." stand for literal character "." instead of "any character".
+ * "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length".
+ *
+ * @return the url patterns for class
+ */
+ String value();
+
+ /**
+ * Name of the character encoding for this template; defaults to "utf8".
+ * NOTE(review): the previous text described a "sourceRegion" XPath region that this
+ * annotation does not declare. Presumably the encoding applies to values put into the url - TODO confirm.
+ *
+ * @return the encoding name
+ */
+ String encoding() default "utf8";
+
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java
new file mode 100644
index 0000000..c66e854
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/direct/Param.java
@@ -0,0 +1,15 @@
+package us.codecraft.webmagic.model.direct;
+
+import java.util.LinkedHashMap;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class Param extends LinkedHashMap<String, Object> { // insertion-ordered String-to-Object map; typed so put(String, Object) really overrides Map.put (with the raw supertype @Override does not compile)
+
+ @Override
+ public Param put(String key, Object value) { // covariant return type: returns this map so calls can be chained, e.g. put(..).put(..)
+ super.put(key, value);
+ return this;
+ }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
index de3fdf5..b381c96 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
@@ -1,12 +1,12 @@
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
+import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
/**
* @author code4crafter@gmail.com
@@ -18,14 +18,26 @@ public class Kr36NewsModel {
@ExtractBy("//h1[@class='entry-title sep10']")
private String title;
- @ExtractBy("//div[@class='mainContent sep-10']")
+ @ExtractBy("//div[@class='mainContent sep-10']/tidyText()")
private String content;
@ExtractByUrl
private String url;
public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/"), new ConsolePageModelPipeline(),
- Kr36NewsModel.class).run();
+ OOSpider.create(Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0),new JsonFilePageModelPipeline(),
+ Kr36NewsModel.class).thread(20).run();
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getContent() {
+ return content;
+ }
+
+ public String getUrl() {
+ return url;
}
}