Clean project structure #70
parent
9606a173cd
commit
6c11718566
22
pom.xml
22
pom.xml
|
@ -6,7 +6,7 @@
|
||||||
<version>7</version>
|
<version>7</version>
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.4.4-SNAPSHOT</version>
|
<version>0.5.0-SNAPSHOT</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -51,11 +51,11 @@
|
||||||
<module>webmagic-core</module>
|
<module>webmagic-core</module>
|
||||||
<module>webmagic-extension/</module>
|
<module>webmagic-extension/</module>
|
||||||
<module>webmagic-scripts/</module>
|
<module>webmagic-scripts/</module>
|
||||||
<module>webmagic-avalon</module>
|
|
||||||
<module>webmagic-lucene</module>
|
|
||||||
<module>webmagic-samples</module>
|
|
||||||
<module>webmagic-saxon</module>
|
|
||||||
<module>webmagic-selenium</module>
|
<module>webmagic-selenium</module>
|
||||||
|
<module>webmagic-saxon</module>
|
||||||
|
<module>webmagic-samples</module>
|
||||||
|
<module>webmagic-admin</module>
|
||||||
|
<module>webmagic-worker</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<dependencyManagement>
|
<dependencyManagement>
|
||||||
|
@ -63,7 +63,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>junit</groupId>
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>junit</artifactId>
|
||||||
<version>4.7</version>
|
<version>4.11</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -91,11 +91,6 @@
|
||||||
<artifactId>xsoup</artifactId>
|
<artifactId>xsoup</artifactId>
|
||||||
<version>0.2.0</version>
|
<version>0.2.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>net.sf.saxon</groupId>
|
|
||||||
<artifactId>Saxon-HE</artifactId>
|
|
||||||
<version>9.5.1-1</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.alibaba</groupId>
|
<groupId>com.alibaba</groupId>
|
||||||
<artifactId>fastjson</artifactId>
|
<artifactId>fastjson</artifactId>
|
||||||
|
@ -121,11 +116,6 @@
|
||||||
<artifactId>commons-collections</artifactId>
|
<artifactId>commons-collections</artifactId>
|
||||||
<version>3.2.1</version>
|
<version>3.2.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
|
||||||
<artifactId>htmlcleaner</artifactId>
|
|
||||||
<version>2.5</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-io</artifactId>
|
<artifactId>commons-io</artifactId>
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
WebMagic-Admin
|
||||||
|
=====
|
||||||
|
Admin is the control web of workers.
|
|
@ -3,12 +3,12 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.4.4-SNAPSHOT</version>
|
<version>0.5.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-avalon</artifactId>
|
<artifactId>webmagic-admin</artifactId>
|
||||||
<packaging>war</packaging>
|
<packaging>war</packaging>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
Before Width: | Height: | Size: 8.5 KiB After Width: | Height: | Size: 8.5 KiB |
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.4.4-SNAPSHOT</version>
|
<version>0.5.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -50,11 +50,6 @@
|
||||||
<artifactId>commons-collections</artifactId>
|
<artifactId>commons-collections</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>net.sourceforge.htmlcleaner</groupId>
|
|
||||||
<artifactId>htmlcleaner</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.assertj</groupId>
|
<groupId>org.assertj</groupId>
|
||||||
<artifactId>assertj-core</artifactId>
|
<artifactId>assertj-core</artifactId>
|
||||||
|
|
|
@ -13,7 +13,6 @@ import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||||
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
|
||||||
import us.codecraft.webmagic.utils.ThreadUtils;
|
import us.codecraft.webmagic.utils.ThreadUtils;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
|
@ -541,15 +540,6 @@ public class Spider implements Runnable, Task {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* switch off xsoup
|
|
||||||
*
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public static void xsoupOff() {
|
|
||||||
EnvironmentUtil.setUseXsoup(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isExitWhenComplete() {
|
public boolean isExitWhenComplete() {
|
||||||
return exitWhenComplete;
|
return exitWhenComplete;
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,6 @@ import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.utils.EnvironmentUtil;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -96,16 +95,11 @@ public class Html extends PlainText {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable xpath(String xpath) {
|
public Selectable xpath(String xpath) {
|
||||||
if (EnvironmentUtil.useXsoup()) {
|
|
||||||
XsoupSelector xsoupSelector = new XsoupSelector(xpath);
|
|
||||||
if (document != null) {
|
|
||||||
return new Html(xsoupSelector.selectList(document));
|
|
||||||
}
|
|
||||||
return selectList(xsoupSelector, strings);
|
|
||||||
} else {
|
|
||||||
XpathSelector xpathSelector = new XpathSelector(xpath);
|
XpathSelector xpathSelector = new XpathSelector(xpath);
|
||||||
return selectList(xpathSelector, strings);
|
if (document != null) {
|
||||||
|
return new Html(xpathSelector.selectList(document));
|
||||||
}
|
}
|
||||||
|
return selectList(xpathSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -32,8 +32,12 @@ public abstract class Selectors {
|
||||||
return new XpathSelector(expr);
|
return new XpathSelector(expr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static XsoupSelector xsoup(String expr) {
|
/**
|
||||||
return new XsoupSelector(expr);
|
* @Deprecated
|
||||||
|
* @see #xpath(String)
|
||||||
|
*/
|
||||||
|
public static XpathSelector xsoup(String expr) {
|
||||||
|
return new XpathSelector(expr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static AndSelector and(Selector... selectors) {
|
public static AndSelector and(Selector... selectors) {
|
||||||
|
|
|
@ -1,70 +1,32 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.htmlcleaner.*;
|
import org.jsoup.nodes.Element;
|
||||||
|
import us.codecraft.xsoup.XPathEvaluator;
|
||||||
|
import us.codecraft.xsoup.Xsoup;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* XPath selector based on HtmlCleaner.<br>
|
* XPath selector based on Xsoup.<br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.3.0
|
||||||
*/
|
*/
|
||||||
public class XpathSelector implements Selector {
|
public class XpathSelector extends BaseElementSelector {
|
||||||
|
|
||||||
private String xpathStr;
|
private XPathEvaluator xPathEvaluator;
|
||||||
|
|
||||||
public XpathSelector(String xpathStr) {
|
public XpathSelector(String xpathStr) {
|
||||||
this.xpathStr = xpathStr;
|
this.xPathEvaluator = Xsoup.compile(xpathStr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(Element element) {
|
||||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
return xPathEvaluator.evaluate(element).get();
|
||||||
TagNode tagNode = htmlCleaner.clean(text);
|
|
||||||
if (tagNode == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
Object[] objects = tagNode.evaluateXPath(xpathStr);
|
|
||||||
if (objects != null && objects.length >= 1) {
|
|
||||||
if (objects[0] instanceof TagNode) {
|
|
||||||
TagNode tagNode1 = (TagNode) objects[0];
|
|
||||||
return htmlCleaner.getInnerHtml(tagNode1);
|
|
||||||
} else {
|
|
||||||
return objects[0].toString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (XPatherException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(String text) {
|
public List<String> selectList(Element element) {
|
||||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
return xPathEvaluator.evaluate(element).list();
|
||||||
TagNode tagNode = htmlCleaner.clean(text);
|
|
||||||
if (tagNode == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
try {
|
|
||||||
Object[] objects = tagNode.evaluateXPath(xpathStr);
|
|
||||||
if (objects != null && objects.length >= 1) {
|
|
||||||
for (Object object : objects) {
|
|
||||||
if (object instanceof TagNode) {
|
|
||||||
TagNode tagNode1 = (TagNode) object;
|
|
||||||
results.add(htmlCleaner.getInnerHtml(tagNode1));
|
|
||||||
} else {
|
|
||||||
results.add(object.toString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (XPatherException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
return results;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,32 +0,0 @@
|
||||||
package us.codecraft.webmagic.selector;
|
|
||||||
|
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
import us.codecraft.xsoup.XPathEvaluator;
|
|
||||||
import us.codecraft.xsoup.Xsoup;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* XPath selector based on Xsoup.<br>
|
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* @since 0.3.0
|
|
||||||
*/
|
|
||||||
public class XsoupSelector extends BaseElementSelector {
|
|
||||||
|
|
||||||
private XPathEvaluator xPathEvaluator;
|
|
||||||
|
|
||||||
public XsoupSelector(String xpathStr) {
|
|
||||||
this.xPathEvaluator = Xsoup.compile(xpathStr);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String select(Element element) {
|
|
||||||
return xPathEvaluator.evaluate(element).get();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<String> selectList(Element element) {
|
|
||||||
return xPathEvaluator.evaluate(element).list();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,28 +0,0 @@
|
||||||
package us.codecraft.webmagic.utils;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.BooleanUtils;
|
|
||||||
|
|
||||||
import java.util.Properties;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com
|
|
||||||
* @since 0.3.0
|
|
||||||
*/
|
|
||||||
public abstract class EnvironmentUtil {
|
|
||||||
|
|
||||||
private static final String USE_XSOUP = "xsoup";
|
|
||||||
|
|
||||||
public static boolean useXsoup() {
|
|
||||||
Properties properties = System.getProperties();
|
|
||||||
Object o = properties.get(USE_XSOUP);
|
|
||||||
if (o == null) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return BooleanUtils.toBoolean(((String) o).toLowerCase());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void setUseXsoup(boolean useXsoup) {
|
|
||||||
Properties properties = System.getProperties();
|
|
||||||
properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false"));
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -29,6 +29,6 @@ public class ExtractorsTest {
|
||||||
Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2));
|
Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2));
|
||||||
OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title"));
|
OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title"));
|
||||||
Assert.assertEquals("aabbcc", or.select(html));
|
Assert.assertEquals("aabbcc", or.select(html));
|
||||||
Assert.assertEquals("aabbcc", or.select(html2));
|
Assert.assertEquals("<title>aabbcc</title>", or.select(html2));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
package us.codecraft.webmagic.utils;
|
|
||||||
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import static junit.framework.Assert.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com
|
|
||||||
*/
|
|
||||||
public class EnvironmentUtilTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test() {
|
|
||||||
assertTrue(EnvironmentUtil.useXsoup());
|
|
||||||
EnvironmentUtil.setUseXsoup(false);
|
|
||||||
assertFalse(EnvironmentUtil.useXsoup());
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.4.4-SNAPSHOT</version>
|
<version>0.5.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -37,12 +37,7 @@ public class ExtractorUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Selector getXpathSelector(String value) {
|
private static Selector getXpathSelector(String value) {
|
||||||
Selector selector;
|
Selector selector = new XpathSelector(value);
|
||||||
if (EnvironmentUtil.useXsoup()) {
|
|
||||||
selector = new XsoupSelector(value);
|
|
||||||
} else {
|
|
||||||
selector = new XpathSelector(value);
|
|
||||||
}
|
|
||||||
return selector;
|
return selector;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,3 +0,0 @@
|
||||||
webmagic-lucene
|
|
||||||
--------
|
|
||||||
尝试将webmagic与lucene结合,打造一个搜索引擎。开发中,不作为webmagic主要模块。
|
|
|
@ -1,46 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<version>0.4.4-SNAPSHOT</version>
|
|
||||||
</parent>
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<artifactId>webmagic-lucene</artifactId>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.lucene</groupId>
|
|
||||||
<artifactId>lucene-analyzers-common</artifactId>
|
|
||||||
<version>4.4.0</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.lucene</groupId>
|
|
||||||
<artifactId>lucene-queryparser</artifactId>
|
|
||||||
<version>4.4.0</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>webmagic-extension</artifactId>
|
|
||||||
<version>${project.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
<build>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<artifactId>maven-deploy-plugin</artifactId>
|
|
||||||
<configuration>
|
|
||||||
<skip>true</skip>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</build>
|
|
||||||
|
|
||||||
|
|
||||||
</project>
|
|
|
@ -1,92 +0,0 @@
|
||||||
package us.codecraft.webmagic.pipeline;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.document.Field;
|
|
||||||
import org.apache.lucene.document.TextField;
|
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
|
||||||
import org.apache.lucene.index.IndexWriter;
|
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
|
||||||
import org.apache.lucene.queryparser.classic.ParseException;
|
|
||||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
|
||||||
import org.apache.lucene.search.Query;
|
|
||||||
import org.apache.lucene.search.ScoreDoc;
|
|
||||||
import org.apache.lucene.store.Directory;
|
|
||||||
import org.apache.lucene.store.RAMDirectory;
|
|
||||||
import org.apache.lucene.util.Version;
|
|
||||||
import us.codecraft.webmagic.ResultItems;
|
|
||||||
import us.codecraft.webmagic.Task;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* Date: 13-8-5 <br>
|
|
||||||
* Time: 下午2:11 <br>
|
|
||||||
*/
|
|
||||||
public class LucenePipeline implements Pipeline {
|
|
||||||
|
|
||||||
private Directory directory;
|
|
||||||
|
|
||||||
private Analyzer analyzer;
|
|
||||||
|
|
||||||
private IndexWriterConfig config;
|
|
||||||
|
|
||||||
private void init() throws IOException {
|
|
||||||
analyzer = new StandardAnalyzer(Version.LUCENE_44);
|
|
||||||
directory = new RAMDirectory();
|
|
||||||
config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
|
|
||||||
}
|
|
||||||
|
|
||||||
public LucenePipeline() {
|
|
||||||
try {
|
|
||||||
init();
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Document> search(String fieldName, String value) throws IOException, ParseException {
|
|
||||||
List<Document> documents = new ArrayList<Document>();
|
|
||||||
DirectoryReader ireader = DirectoryReader.open(directory);
|
|
||||||
IndexSearcher isearcher = new IndexSearcher(ireader);
|
|
||||||
// Parse a simple query that searches for "text":
|
|
||||||
QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
|
|
||||||
Query query = parser.parse(value);
|
|
||||||
ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
|
|
||||||
// Iterate through the results:
|
|
||||||
for (int i = 0; i < hits.length; i++) {
|
|
||||||
Document hitDoc = isearcher.doc(hits[i].doc);
|
|
||||||
documents.add(hitDoc);
|
|
||||||
}
|
|
||||||
ireader.close();
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(ResultItems resultItems, Task task) {
|
|
||||||
if (resultItems.isSkip()){
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
Document doc = new Document();
|
|
||||||
Map<String,Object> all = resultItems.getAll();
|
|
||||||
if (all==null){
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
|
|
||||||
doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED));
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
IndexWriter indexWriter = new IndexWriter(directory, config);
|
|
||||||
indexWriter.addDocument(doc);
|
|
||||||
indexWriter.close();
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,61 +0,0 @@
|
||||||
package us.codecraft.webmagic.lucene;
|
|
||||||
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.queryparser.classic.ParseException;
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
|
||||||
import us.codecraft.webmagic.model.OOSpider;
|
|
||||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
|
||||||
import us.codecraft.webmagic.pipeline.LucenePipeline;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
* Date: 13-8-2 <br>
|
|
||||||
* Time: 上午7:52 <br>
|
|
||||||
*/
|
|
||||||
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
|
||||||
public class OschinaBlog {
|
|
||||||
|
|
||||||
@ExtractBy("//title")
|
|
||||||
private String title;
|
|
||||||
|
|
||||||
@ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
|
|
||||||
private String content;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return "OschinaBlog{" +
|
|
||||||
"title='" + title + '\'' +
|
|
||||||
", content='" + content + '\'' +
|
|
||||||
'}';
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) {
|
|
||||||
LucenePipeline pipeline = new LucenePipeline();
|
|
||||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync();
|
|
||||||
while (true) {
|
|
||||||
try {
|
|
||||||
List<Document> search = pipeline.search("title", "webmagic");
|
|
||||||
System.out.println(search);
|
|
||||||
Thread.sleep(3000);
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} catch (ParseException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getTitle() {
|
|
||||||
return title;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getContent() {
|
|
||||||
return content;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,20 +0,0 @@
|
||||||
Worker:
|
|
||||||
|
|
||||||
任务执行者,提供Http接口,监控运行状态,终止和开始job
|
|
||||||
|
|
||||||
队列:
|
|
||||||
|
|
||||||
仍然使用redis
|
|
||||||
|
|
||||||
Panel:
|
|
||||||
|
|
||||||
提供Web管理后台,管理
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
1. 新建任务
|
|
||||||
1. 通过脚本
|
|
||||||
2. 配置
|
|
||||||
3. 分配机器
|
|
||||||
2. 已有任务
|
|
||||||
3. 任务查看
|
|
|
@ -1,35 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<parent>
|
|
||||||
<artifactId>webmagic-parent</artifactId>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<version>0.4.3-SNAPSHOT</version>
|
|
||||||
</parent>
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>webmagic-panel</artifactId>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>us.codecraft</groupId>
|
|
||||||
<artifactId>webmagic-scripts</artifactId>
|
|
||||||
<version>${project.version}</version>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
<build>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<artifactId>maven-deploy-plugin</artifactId>
|
|
||||||
<configuration>
|
|
||||||
<skip>true</skip>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</build>
|
|
||||||
|
|
||||||
|
|
||||||
</project>
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.4.4-SNAPSHOT</version>
|
<version>0.5.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.4.4-SNAPSHOT</version>
|
<version>0.5.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -15,9 +15,15 @@
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||||
|
<artifactId>htmlcleaner</artifactId>
|
||||||
|
<version>2.5</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
<artifactId>Saxon-HE</artifactId>
|
<artifactId>Saxon-HE</artifactId>
|
||||||
|
<version>9.5.1-1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>junit</groupId>
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue