update lucene pipeline
parent
29f8cd2ec6
commit
7d277e84d4
|
@ -228,9 +228,11 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
pageProcessor.process(page);
|
pageProcessor.process(page);
|
||||||
addRequest(page);
|
addRequest(page);
|
||||||
|
if (!page.getResultItems().isSkip()){
|
||||||
for (Pipeline pipeline : pipelines) {
|
for (Pipeline pipeline : pipelines) {
|
||||||
pipeline.process(page.getResultItems(), this);
|
pipeline.process(page.getResultItems(), this);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
sleep(site.getSleepTime());
|
sleep(site.getSleepTime());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,9 +15,6 @@ public class ConsolePipeline implements Pipeline{
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems,Task task) {
|
public void process(ResultItems resultItems,Task task) {
|
||||||
if (resultItems.isSkip()){
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
||||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
if (entry.getValue() instanceof Iterable) {
|
if (entry.getValue() instanceof Iterable) {
|
||||||
|
|
|
@ -47,9 +47,6 @@ public class FilePipeline implements Pipeline {
|
||||||
if (!file.exists()) {
|
if (!file.exists()) {
|
||||||
file.mkdirs();
|
file.mkdirs();
|
||||||
}
|
}
|
||||||
if (resultItems.isSkip()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
try {
|
try {
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
|
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
|
||||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||||
|
|
|
@ -3,20 +3,26 @@ package us.codecraft.webmagic.pipeline;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.queryparser.classic.ParseException;
|
||||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.ScoreDoc;
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
import us.codecraft.webmagic.ResultItems;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author yihua.huang@dianping.com <br>
|
* @author yihua.huang@dianping.com <br>
|
||||||
|
@ -24,41 +30,64 @@ import java.io.File;
|
||||||
* Time: 下午2:11 <br>
|
* Time: 下午2:11 <br>
|
||||||
*/
|
*/
|
||||||
public class LucenePipeline implements Pipeline {
|
public class LucenePipeline implements Pipeline {
|
||||||
@Override
|
|
||||||
public void process(ResultItems resultItems, Task task) {
|
|
||||||
try {
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
private Directory directory;
|
||||||
|
|
||||||
}
|
private IndexWriter indexWriter;
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
private Analyzer analyzer;
|
||||||
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
|
|
||||||
// Directory directory = new RAMDirectory();
|
private void init() throws IOException {
|
||||||
// To store an index on disk, use this instead:
|
analyzer = new StandardAnalyzer(Version.LUCENE_44);
|
||||||
Directory directory = FSDirectory.open(new File("/data/webmagic/www.guoxue123.cn/"));
|
directory = new RAMDirectory();
|
||||||
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
|
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
|
||||||
IndexWriter iwriter = new IndexWriter(directory, config);
|
indexWriter = new IndexWriter(directory, config);
|
||||||
Document doc = new Document();
|
indexWriter.close();
|
||||||
// String text = "This is the text to be indexed.";
|
}
|
||||||
// doc.add(new Field("fieldname", text, TextField.TYPE_STORED));
|
|
||||||
// iwriter.addDocument(doc);
|
|
||||||
iwriter.close();
|
|
||||||
|
|
||||||
// Now search the index:
|
public LucenePipeline() {
|
||||||
|
try {
|
||||||
|
init();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Document> search(String fieldName, String value) throws IOException, ParseException {
|
||||||
|
List<Document> documents = new ArrayList<Document>();
|
||||||
DirectoryReader ireader = DirectoryReader.open(directory);
|
DirectoryReader ireader = DirectoryReader.open(directory);
|
||||||
IndexSearcher isearcher = new IndexSearcher(ireader);
|
IndexSearcher isearcher = new IndexSearcher(ireader);
|
||||||
// Parse a simple query that searches for "text":
|
// Parse a simple query that searches for "text":
|
||||||
QueryParser parser = new QueryParser(Version.LUCENE_44, "fieldname", analyzer);
|
QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
|
||||||
Query query = parser.parse("经典");
|
Query query = parser.parse(value);
|
||||||
ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
|
ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
|
||||||
// Iterate through the results:
|
// Iterate through the results:
|
||||||
for (int i = 0; i < hits.length; i++) {
|
for (int i = 0; i < hits.length; i++) {
|
||||||
Document hitDoc = isearcher.doc(hits[i].doc);
|
Document hitDoc = isearcher.doc(hits[i].doc);
|
||||||
System.out.println(hitDoc);
|
documents.add(hitDoc);
|
||||||
}
|
}
|
||||||
ireader.close();
|
ireader.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
|
return documents;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(ResultItems resultItems, Task task) {
|
||||||
|
if (resultItems.isSkip()){
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Document doc = new Document();
|
||||||
|
Map<String,Object> all = resultItems.getAll();
|
||||||
|
if (all==null){
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
|
||||||
|
doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED));
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,9 +29,6 @@ class ModelPipeline implements Pipeline {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
if (resultItems.isSkip()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
|
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
|
||||||
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
|
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
|
||||||
if (o != null) {
|
if (o != null) {
|
||||||
|
|
|
@ -40,9 +40,6 @@ public class FreemarkerPipeline implements Pipeline {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
if (resultItems.isSkip()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
String path = this.path + "" + task.getUUID() + "/";
|
String path = this.path + "" + task.getUUID() + "/";
|
||||||
File file = new File(path);
|
File file = new File(path);
|
||||||
if (!file.exists()) {
|
if (!file.exists()) {
|
||||||
|
|
Loading…
Reference in New Issue