update the script
parent
c1e7207869
commit
37666a7151
|
@ -16,6 +16,10 @@
|
|||
<artifactId>jruby</artifactId>
|
||||
<version>1.7.6</version>
|
||||
</dependency>
|
||||
<dependency><groupId>org.python</groupId>
|
||||
<artifactId>jython</artifactId>
|
||||
<version>2.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-cli</groupId>
|
||||
<artifactId>commons-cli</artifactId>
|
||||
|
|
|
@ -7,7 +7,9 @@ public enum Language {
|
|||
|
||||
JavaScript("javascript","js/defines.js",""),
|
||||
|
||||
JRuby("jruby","ruby/defines.rb","");
|
||||
JRuby("jruby","ruby/defines.rb",""),
|
||||
|
||||
Jython("jython","python/defines.py","");
|
||||
|
||||
private String engineName;
|
||||
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
package us.codecraft.webmagic.scripts;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.jruby.RubyHash;
|
||||
import org.python.core.PyDictionary;
|
||||
import sun.org.mozilla.javascript.internal.NativeObject;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
@ -10,6 +13,8 @@ import javax.script.ScriptEngine;
|
|||
import javax.script.ScriptException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
|
@ -50,20 +55,34 @@ public class ScriptProcessor implements PageProcessor {
|
|||
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
||||
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
|
||||
try {
|
||||
switch (language) {
|
||||
case JavaScript:
|
||||
engine.eval(defines + "\n" + script, context);
|
||||
// switch (language) {
|
||||
// case JavaScript:
|
||||
// NativeObject o = (NativeObject) engine.get("result");
|
||||
// if (o != null) {
|
||||
// for (Map.Entry<Object, Object> objectObjectEntry : o.entrySet()) {
|
||||
// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue());
|
||||
// }
|
||||
// }
|
||||
// break;
|
||||
// case JRuby:
|
||||
// Object o1 = engine.get("result");
|
||||
// break;
|
||||
// }
|
||||
NativeObject o = (NativeObject) engine.get("result");
|
||||
if (o != null) {
|
||||
for (Map.Entry<Object, Object> objectObjectEntry : o.entrySet()) {
|
||||
page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue());
|
||||
}
|
||||
}
|
||||
break;
|
||||
case JRuby:
|
||||
RubyHash oRuby=(RubyHash)engine.eval(defines+"\n"+script,context);
|
||||
Iterator itruby = oRuby.entrySet().iterator();
|
||||
while (itruby.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry)itruby.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(),pairs.getValue());
|
||||
}
|
||||
break;
|
||||
case Jython:
|
||||
engine.eval(defines + "\n" + script, context);
|
||||
PyDictionary oJython=(PyDictionary)engine.get("result");
|
||||
Iterator it = oJython.entrySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Map.Entry pairs = (Map.Entry)it.next();
|
||||
page.getResultItems().put(pairs.getKey().toString(),pairs.getValue());
|
||||
}
|
||||
break;
|
||||
}
|
||||
} catch (ScriptException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -72,6 +91,7 @@ public class ScriptProcessor implements PageProcessor {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
def xpath(str):
    """Evaluate an XPath expression against the current page's HTML.

    ``page`` is not defined in this script; it is resolved as a global when
    the function is called (presumably supplied by the embedding script
    engine -- confirm against the host code).

    :param str: the XPath expression to evaluate. The parameter name shadows
        the ``str`` builtin inside this function.
    :return: the result of calling ``toString()`` on the selection.
    """
    html = page.getHtml()
    selection = html.xpath(str)
    return selection.toString()
|
||||
|
||||
def css(str):
    """Evaluate a CSS selector against the current page's HTML.

    ``page`` is not defined in this script; it is resolved as a global when
    the function is called (presumably supplied by the embedding script
    engine -- confirm against the host code).

    :param str: the CSS selector to evaluate. The parameter name shadows the
        ``str`` builtin inside this function.
    :return: the result of calling ``toString()`` on the selection.
    """
    html = page.getHtml()
    selection = html.css(str)
    return selection.toString()
|
||||
|
||||
def urls(str):
    """Add every page link matching a regular expression to the page's target requests.

    Collects the links of the current page's HTML, filters them with the
    given pattern, and passes the complete match list to
    ``page.addTargetRequests``. ``page`` is resolved as a global at call
    time; it is not defined in this script.

    :param str: the regular expression used to filter links. The parameter
        name shadows the ``str`` builtin inside this function.
    :return: ``None``; the function is called for its side effect only.
    """
    all_links = page.getHtml().links()
    matching = all_links.regex(str).all()
    page.addTargetRequests(matching)
|
||||
|
||||
def tomap(key, value):
    """Placeholder helper: ignores both arguments and returns a fixed string.

    :param key: unused.
    :param value: unused.
    :return: always the literal ``"hello world"``.
    """
    placeholder = "hello world"
    return placeholder
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
# Extraction script: builds a `result` dictionary holding the page title and
# a URL pattern. `xpath` is not defined here; it must already be in scope
# when this script is evaluated.
title = xpath("div[@class=BlogTitle]")

# NOTE(review): this binds the name `urls` to a pattern string. If a helper
# function called `urls` is in scope when the script runs, this assignment
# rebinds it instead of calling it -- confirm that this is intended.
urls = "http://my\\.oschina\\.net/flashsword/blog/\\d+"

# Assemble the output mapping one key at a time.
result = {}
result["title"] = title
result["urls"] = urls
|
|
@ -22,4 +22,12 @@ public class ScriptProcessorTest {
|
|||
pageProcessor.getSite().setSleepTime(0);
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testPythonProcessor() {
|
||||
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build();
|
||||
pageProcessor.getSite().setSleepTime(0);
|
||||
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue