introduce jsoup and CssSelector

master
yihua.huang 2013-07-20 08:34:18 +08:00
parent c733046045
commit 81e7f7982e
8 changed files with 76 additions and 1 deletions

View File

@ -52,6 +52,12 @@
<version>2.4</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>

View File

@ -154,9 +154,11 @@ public class Spider implements Runnable, Task {
request = scheduler.poll(this);
}
} else {
//multi thread
final AtomicInteger threadAlive = new AtomicInteger(0);
while (true) {
if (request == null) {
//when no request found but some thread is alive, sleep a while.
try {
Thread.sleep(100);
} catch (InterruptedException e) {

View File

@ -0,0 +1,47 @@
package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Selector that extracts HTML fragments with a CSS (jQuery-style) query,
 * using jsoup to parse the text and evaluate the query.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 9:39
 */
public class CssSelector implements Selector {

    /** CSS query passed to jsoup's {@code select}. */
    private final String selectorText;

    /**
     * @param selectorText CSS query used for every selection made by this instance
     */
    public CssSelector(String selectorText) {
        this.selectorText = selectorText;
    }

    /**
     * Returns the outer HTML of the first element matching the CSS query.
     *
     * @param text HTML text to parse
     * @return outer HTML of the first matching element, or {@code null} when nothing matches
     */
    @Override
    public String select(String text) {
        Document doc = Jsoup.parse(text);
        Elements elements = doc.select(selectorText);
        // The guard must test for the EMPTY case: with isNotEmpty the method
        // returned null for every successful match and ran get(0) on an empty
        // result, which throws IndexOutOfBoundsException.
        if (CollectionUtils.isEmpty(elements)) {
            return null;
        }
        return elements.get(0).outerHtml();
    }

    /**
     * Returns the outer HTML of every element matching the CSS query.
     *
     * @param text HTML text to parse
     * @return outer HTML of each match in the order jsoup reports them; empty when nothing matches
     */
    @Override
    public List<String> selectList(String text) {
        List<String> strings = new ArrayList<String>();
        Document doc = Jsoup.parse(text);
        Elements elements = doc.select(selectorText);
        if (CollectionUtils.isNotEmpty(elements)) {
            for (Element element : elements) {
                strings.add(element.outerHtml());
            }
        }
        return strings;
    }
}

View File

@ -62,4 +62,10 @@ public class Html extends PlainText {
return selectList(xpathSelector, strings);
}
/**
 * Applies a CSS (jQuery-style) selector: wraps the given expression in a
 * {@link CssSelector} and runs it through {@code selectList} over this
 * object's {@code strings}.
 *
 * @param selector CSS selector expression
 * @return the result produced by {@code selectList}
 */
@Override
public Selectable $(String selector) {
    return selectList(new CssSelector(selector), strings);
}
}

View File

@ -33,6 +33,11 @@ public class PlainText implements Selectable {
throw new UnsupportedOperationException();
}
/**
 * CSS (jQuery-style) selection is not supported here; this implementation
 * always fails.
 *
 * @param selector CSS selector expression (unused)
 * @throws UnsupportedOperationException always
 */
@Override
public Selectable $(String selector) {
throw new UnsupportedOperationException();
}
@Override
public Selectable smartContent() {
throw new UnsupportedOperationException();

View File

@ -17,6 +17,14 @@ public interface Selectable {
*/
public Selectable xpath(String xpath);
/**
 * select list with jquery selector
 *
 * @param selector jQuery-style (CSS) selector expression
 * @return the selection result as a {@link Selectable}
 */
public Selectable $(String selector);
/**
* select smart content with ReadAbility algorithm
*

View File

@ -17,4 +17,5 @@ public class HtmlTest {
Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()));
}
}

View File

@ -1351,7 +1351,7 @@ public class XpathSelectorTest {
public void testOschina() {
    // Build the Html wrapper once from the shared fixture and reuse it below.
    Html page = new Html(html);

    // The question title is the anchor text inside the QTitle heading.
    String title = page.xpath(".//*[@class='QTitle']/h1/a").toString();
    Assert.assertEquals("再次吐槽easyui", title);

    // Print the links found inside the <body> section (output only, no assertion).
    System.out.println(page.regex("(<body>.*?</body>)").links().toStrings());

    // Chaining a CSS selection into an xpath must yield a non-null result.
    Assert.assertNotNull(page.$("a[href]").xpath("//@href").toStrings());
}
}