Do not cache document in Selectable for selected Html element #73
parent
6201fd6966
commit
8b35d79569
|
@ -7,3 +7,25 @@ CREATE TABLE `DynamicClass` (
|
|||
PRIMARY KEY (`Id`),
|
||||
UNIQUE KEY `un_class_name` (`ClassName`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
-- Spider: one row per spider definition.
-- PageProcessorId / PipelineId / SchedulerId presumably reference rows in the
-- corresponding component tables (only `PageProcessor` is visible in this
-- script) -- TODO confirm and add foreign keys if desired.
CREATE TABLE `Spider` (
  -- Surrogate primary key; the only AUTO_INCREMENT column, because MySQL
  -- permits a single auto column per table and requires it to be a key.
  `Id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  -- Reference ids are supplied by the writer, so they are plain NOT NULL columns.
  `PageProcessorId` int(11) unsigned NOT NULL,
  `PipelineId` int(11) unsigned NOT NULL,
  `SchedulerId` int(11) unsigned NOT NULL,
  `Config` text NOT NULL,
  `AddTime` datetime NOT NULL,
  `UpdateTime` datetime NOT NULL,
  -- No UNIQUE KEY on `ClassName` here: this table has no such column, and
  -- declaring the key makes the CREATE TABLE statement fail.
  PRIMARY KEY (`Id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
-- PageProcessor: one row per page-processor class name, with its parameters
-- stored as text and creation / modification timestamps.
CREATE TABLE `PageProcessor` (
    `Id`         int(11) unsigned NOT NULL AUTO_INCREMENT,
    `ClassName`  varchar(200)     NOT NULL,
    `Params`     text             NOT NULL,
    `AddTime`    datetime         NOT NULL,
    `UpdateTime` datetime         NOT NULL,
    PRIMARY KEY (`Id`),
    -- Each class name may be stored at most once.
    UNIQUE KEY `un_class_name` (`ClassName`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
|
@ -1,20 +0,0 @@
|
|||
package us.codecraft.webmagic.avalon.web;
|
||||
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.servlet.ModelAndView;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
@Controller("dashboard")
|
||||
@RequestMapping("/")
|
||||
public class DashBoardController {
|
||||
|
||||
@RequestMapping
|
||||
public ModelAndView index() {
|
||||
ModelAndView map = new ModelAndView("dashboard");
|
||||
return map;
|
||||
}
|
||||
|
||||
}
|
|
@ -8,6 +8,8 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
import java.util.concurrent.ExecutorService;
|
||||
|
||||
/**
|
||||
* Container of Spiders.
|
||||
*
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class Worker {
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
package us.codecraft.webmagic.avalon.web;
|
||||
package us.codecraft.webmagic.worker.controller;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.ResponseBody;
|
||||
import us.codecraft.webmagic.worker.Worker;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
@ -10,15 +13,19 @@ import java.util.Map;
|
|||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
@Controller("spider")
|
||||
@Controller
|
||||
@RequestMapping("spider")
|
||||
public class SpiderController {
|
||||
|
||||
@Autowired
|
||||
private Worker worker;
|
||||
|
||||
@RequestMapping("create")
|
||||
@ResponseBody
|
||||
public Map<String, Object> create() {
|
||||
public Map<String, Object> create(@RequestParam("id") String id) {
|
||||
HashMap<String, Object> map = new HashMap<String, Object>();
|
||||
map.put("code", 200);
|
||||
return map;
|
||||
}
|
||||
|
||||
}
|
|
@ -23,7 +23,7 @@ public class Html extends PlainText {
|
|||
*/
|
||||
private Document document;
|
||||
|
||||
private boolean init = false;
|
||||
private boolean needInitCache = true;
|
||||
|
||||
public Html(List<String> strings) {
|
||||
super(strings);
|
||||
|
@ -33,12 +33,22 @@ public class Html extends PlainText {
|
|||
super(text);
|
||||
}
|
||||
|
||||
public Html(List<String> strings, boolean needInitCache) {
|
||||
super(strings);
|
||||
this.needInitCache = needInitCache;
|
||||
}
|
||||
|
||||
public Html(String text, boolean needInitCache) {
|
||||
super(text);
|
||||
this.needInitCache = needInitCache;
|
||||
}
|
||||
|
||||
/**
|
||||
* lazy init
|
||||
*/
|
||||
private void initDocument() {
|
||||
if (this.document == null && !init) {
|
||||
init = true;
|
||||
if (this.document == null && needInitCache) {
|
||||
needInitCache = false;
|
||||
//just init once whether the parsing succeeds or not
|
||||
try {
|
||||
this.document = Jsoup.parse(getText());
|
||||
|
@ -67,7 +77,7 @@ public class Html extends PlainText {
|
|||
results.add(result);
|
||||
}
|
||||
}
|
||||
return new Html(results);
|
||||
return new Html(results, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -78,7 +88,7 @@ public class Html extends PlainText {
|
|||
List<String> result = selector.selectList(string);
|
||||
results.addAll(result);
|
||||
}
|
||||
return new Html(results);
|
||||
return new Html(results, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -95,9 +105,9 @@ public class Html extends PlainText {
|
|||
|
||||
@Override
|
||||
public Selectable xpath(String xpath) {
|
||||
XpathSelector xpathSelector = new XpathSelector(xpath);
|
||||
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
||||
if (document != null) {
|
||||
return new Html(xpathSelector.selectList(document));
|
||||
return new Html(xpathSelector.selectList(document), false);
|
||||
}
|
||||
return selectList(xpathSelector, strings);
|
||||
}
|
||||
|
@ -106,7 +116,7 @@ public class Html extends PlainText {
|
|||
public Selectable $(String selector) {
|
||||
CssSelector cssSelector = Selectors.$(selector);
|
||||
if (document != null) {
|
||||
return new Html(cssSelector.selectList(document));
|
||||
return new Html(cssSelector.selectList(document), false);
|
||||
}
|
||||
return selectList(cssSelector, strings);
|
||||
}
|
||||
|
@ -115,7 +125,7 @@ public class Html extends PlainText {
|
|||
public Selectable $(String selector, String attrName) {
|
||||
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||
if (document != null) {
|
||||
return new Html(cssSelector.selectList(document));
|
||||
return new Html(cssSelector.selectList(document), false);
|
||||
}
|
||||
return selectList(cssSelector, strings);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class SelectorTest {
|
||||
|
||||
private String html = "<div><a href='http://whatever.com/aaa'></a></div><div><a href='http://whatever.com/bbb'></a></div>";
|
||||
|
||||
@Test
|
||||
public void testChain() throws Exception {
|
||||
Html selectable = new Html(html);
|
||||
List<String> linksWithoutChain = selectable.links().all();
|
||||
Selectable xpath = selectable.xpath("//div");
|
||||
List<String> linksWithChainFirstCall = xpath.links().all();
|
||||
List<String> linksWithChainSecondCall = xpath.links().all();
|
||||
assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall);
|
||||
assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue