Do not cache document in Selectable for selected Html element #73
parent
6201fd6966
commit
8b35d79569
|
@ -6,4 +6,26 @@ CREATE TABLE `DynamicClass` (
|
||||||
`UpdateTime` datetime NOT NULL,
|
`UpdateTime` datetime NOT NULL,
|
||||||
PRIMARY KEY (`Id`),
|
PRIMARY KEY (`Id`),
|
||||||
UNIQUE KEY `un_class_name` (`ClassName`)
|
UNIQUE KEY `un_class_name` (`ClassName`)
|
||||||
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||||
|
|
||||||
|
CREATE TABLE `Spider` (
|
||||||
|
`Id` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||||
|
`PageProcessorId` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||||
|
`PipelineId` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||||
|
`SchedulerId` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||||
|
`Config` text NOT NULL,
|
||||||
|
`AddTime` datetime NOT NULL,
|
||||||
|
`UpdateTime` datetime NOT NULL,
|
||||||
|
PRIMARY KEY (`Id`),
|
||||||
|
UNIQUE KEY `un_class_name` (`ClassName`)
|
||||||
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||||
|
|
||||||
|
CREATE TABLE `PageProcessor` (
|
||||||
|
`Id` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||||
|
`ClassName` varchar(200) NOT NULL,
|
||||||
|
`Params` text NOT NULL,
|
||||||
|
`AddTime` datetime NOT NULL,
|
||||||
|
`UpdateTime` datetime NOT NULL,
|
||||||
|
PRIMARY KEY (`Id`),
|
||||||
|
UNIQUE KEY `un_class_name` (`ClassName`)
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
|
@ -1,20 +0,0 @@
|
||||||
package us.codecraft.webmagic.avalon.web;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Controller;
|
|
||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
|
||||||
import org.springframework.web.servlet.ModelAndView;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com
|
|
||||||
*/
|
|
||||||
@Controller("dashboard")
|
|
||||||
@RequestMapping("/")
|
|
||||||
public class DashBoardController {
|
|
||||||
|
|
||||||
@RequestMapping
|
|
||||||
public ModelAndView index() {
|
|
||||||
ModelAndView map = new ModelAndView("dashboard");
|
|
||||||
return map;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -8,6 +8,8 @@ import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Container of Spiders.
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
*/
|
*/
|
||||||
public class Worker {
|
public class Worker {
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
package us.codecraft.webmagic.avalon.web;
|
package us.codecraft.webmagic.worker.controller;
|
||||||
|
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.stereotype.Controller;
|
import org.springframework.stereotype.Controller;
|
||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
import org.springframework.web.bind.annotation.ResponseBody;
|
import org.springframework.web.bind.annotation.ResponseBody;
|
||||||
|
import us.codecraft.webmagic.worker.Worker;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -10,15 +13,19 @@ import java.util.Map;
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
*/
|
*/
|
||||||
@Controller("spider")
|
@Controller
|
||||||
@RequestMapping("spider")
|
@RequestMapping("spider")
|
||||||
public class SpiderController {
|
public class SpiderController {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private Worker worker;
|
||||||
|
|
||||||
@RequestMapping("create")
|
@RequestMapping("create")
|
||||||
@ResponseBody
|
@ResponseBody
|
||||||
public Map<String, Object> create() {
|
public Map<String, Object> create(@RequestParam("id") String id) {
|
||||||
HashMap<String, Object> map = new HashMap<String, Object>();
|
HashMap<String, Object> map = new HashMap<String, Object>();
|
||||||
map.put("code", 200);
|
map.put("code", 200);
|
||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -23,7 +23,7 @@ public class Html extends PlainText {
|
||||||
*/
|
*/
|
||||||
private Document document;
|
private Document document;
|
||||||
|
|
||||||
private boolean init = false;
|
private boolean needInitCache = true;
|
||||||
|
|
||||||
public Html(List<String> strings) {
|
public Html(List<String> strings) {
|
||||||
super(strings);
|
super(strings);
|
||||||
|
@ -33,12 +33,22 @@ public class Html extends PlainText {
|
||||||
super(text);
|
super(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Html(List<String> strings, boolean needInitCache) {
|
||||||
|
super(strings);
|
||||||
|
this.needInitCache = needInitCache;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Html(String text, boolean needInitCache) {
|
||||||
|
super(text);
|
||||||
|
this.needInitCache = needInitCache;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* lazy init
|
* lazy init
|
||||||
*/
|
*/
|
||||||
private void initDocument() {
|
private void initDocument() {
|
||||||
if (this.document == null && !init) {
|
if (this.document == null && needInitCache) {
|
||||||
init = true;
|
needInitCache = false;
|
||||||
//just init once whether the parsing succeeds or not
|
//just init once whether the parsing succeeds or not
|
||||||
try {
|
try {
|
||||||
this.document = Jsoup.parse(getText());
|
this.document = Jsoup.parse(getText());
|
||||||
|
@ -67,7 +77,7 @@ public class Html extends PlainText {
|
||||||
results.add(result);
|
results.add(result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return new Html(results);
|
return new Html(results, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -78,7 +88,7 @@ public class Html extends PlainText {
|
||||||
List<String> result = selector.selectList(string);
|
List<String> result = selector.selectList(string);
|
||||||
results.addAll(result);
|
results.addAll(result);
|
||||||
}
|
}
|
||||||
return new Html(results);
|
return new Html(results, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -95,9 +105,9 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable xpath(String xpath) {
|
public Selectable xpath(String xpath) {
|
||||||
XpathSelector xpathSelector = new XpathSelector(xpath);
|
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
||||||
if (document != null) {
|
if (document != null) {
|
||||||
return new Html(xpathSelector.selectList(document));
|
return new Html(xpathSelector.selectList(document), false);
|
||||||
}
|
}
|
||||||
return selectList(xpathSelector, strings);
|
return selectList(xpathSelector, strings);
|
||||||
}
|
}
|
||||||
|
@ -106,7 +116,7 @@ public class Html extends PlainText {
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
CssSelector cssSelector = Selectors.$(selector);
|
CssSelector cssSelector = Selectors.$(selector);
|
||||||
if (document != null) {
|
if (document != null) {
|
||||||
return new Html(cssSelector.selectList(document));
|
return new Html(cssSelector.selectList(document), false);
|
||||||
}
|
}
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
@ -115,7 +125,7 @@ public class Html extends PlainText {
|
||||||
public Selectable $(String selector, String attrName) {
|
public Selectable $(String selector, String attrName) {
|
||||||
CssSelector cssSelector = Selectors.$(selector, attrName);
|
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||||
if (document != null) {
|
if (document != null) {
|
||||||
return new Html(cssSelector.selectList(document));
|
return new Html(cssSelector.selectList(document), false);
|
||||||
}
|
}
|
||||||
return selectList(cssSelector, strings);
|
return selectList(cssSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class SelectorTest {
|
||||||
|
|
||||||
|
private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testChain() throws Exception {
|
||||||
|
Html selectable = new Html(html);
|
||||||
|
List<String> linksWithoutChain = selectable.links().all();
|
||||||
|
Selectable xpath = selectable.xpath("//div");
|
||||||
|
List<String> linksWithChainFirstCall = xpath.links().all();
|
||||||
|
List<String> linksWithChainSecondCall = xpath.links().all();
|
||||||
|
assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall);
|
||||||
|
assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue