add javascript support
parent
df8ca8ad09
commit
0633ea16ef
|
@ -0,0 +1,61 @@
|
||||||
|
package us.codecraft.webmagic.processor;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
|
||||||
|
import javax.script.ScriptContext;
|
||||||
|
import javax.script.ScriptEngine;
|
||||||
|
import javax.script.ScriptEngineManager;
|
||||||
|
import javax.script.ScriptException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class JsScriptProcessor implements PageProcessor{
|
||||||
|
|
||||||
|
private ScriptEngine rubyEngine;
|
||||||
|
|
||||||
|
private String defines;
|
||||||
|
|
||||||
|
private String script;
|
||||||
|
|
||||||
|
public JsScriptProcessor(String filename){
|
||||||
|
ScriptEngineManager manager = new ScriptEngineManager();
|
||||||
|
rubyEngine = manager.getEngineByName("javascript");
|
||||||
|
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("js/defines.js");
|
||||||
|
try {
|
||||||
|
defines = IOUtils.toString(resourceAsStream);
|
||||||
|
resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename);
|
||||||
|
script = IOUtils.toString(resourceAsStream);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
ScriptContext context = rubyEngine.getContext();
|
||||||
|
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
||||||
|
try {
|
||||||
|
rubyEngine.eval(defines+script, context);
|
||||||
|
} catch (ScriptException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return Site.me();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new JsScriptProcessor("js/oschina.js")).addUrl("http://my.oschina.net/flashsword/blog").run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -15,39 +15,39 @@ import java.io.InputStream;
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
*/
|
*/
|
||||||
public class ScriptProcessor implements PageProcessor{
|
public class RubyScriptProcessor implements PageProcessor{
|
||||||
|
|
||||||
private ScriptEngine rubyEngine;
|
private ScriptEngine rubyEngine;
|
||||||
|
|
||||||
private String defines;
|
private String defines;
|
||||||
|
|
||||||
ScriptProcessor(){
|
private String script;
|
||||||
|
|
||||||
|
public RubyScriptProcessor(String filename){
|
||||||
ScriptEngineManager manager = new ScriptEngineManager();
|
ScriptEngineManager manager = new ScriptEngineManager();
|
||||||
rubyEngine = manager.getEngineByName("jruby");
|
rubyEngine = manager.getEngineByName("jruby");
|
||||||
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/defines.rb");
|
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/defines.rb");
|
||||||
try {
|
try {
|
||||||
defines = IOUtils.toString(resourceAsStream);
|
defines = IOUtils.toString(resourceAsStream);
|
||||||
|
resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename);
|
||||||
|
script = IOUtils.toString(resourceAsStream);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
ScriptContext context = rubyEngine.getContext();
|
ScriptContext context = rubyEngine.getContext();
|
||||||
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
||||||
String script;
|
|
||||||
try {
|
try {
|
||||||
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/oschina.rb");
|
|
||||||
try {
|
|
||||||
script = IOUtils.toString(resourceAsStream);
|
|
||||||
rubyEngine.eval(defines+script, context);
|
rubyEngine.eval(defines+script, context);
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
} catch (ScriptException e) {
|
} catch (ScriptException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -56,6 +56,6 @@ public class ScriptProcessor implements PageProcessor{
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new ScriptProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run();
|
Spider.create(new RubyScriptProcessor("ruby/oschina.rb")).addUrl("http://my.oschina.net/flashsword/blog").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,10 @@
|
||||||
|
// Runs page.getHtml().$(str) on the current page (the global `page` is supplied by the
// host before the script is evaluated) and returns the match converted with toString().
// `str` is a selector expression, e.g. "div.BlogTitle h1".
function $(str){
    var selection = page.getHtml().$(str);
    return selection.toString();
}
|
||||||
|
// Evaluates the XPath expression `str` via page.getHtml().xpath(str) on the current page
// (the global `page` is supplied by the host) and returns the result converted with toString().
function xpath(str){
    var selection = page.getHtml().xpath(str);
    return selection.toString();
}
|
||||||
|
// Collects every link on the current page that matches the regular expression `str`
// and hands the list to page.addTargetRequests(). Returns nothing.
// The global `page` is supplied by the host before the script is evaluated.
function urls(str){
    // `var` keeps the list local; without it the assignment created an implicit
    // global `links` that stayed in the engine scope between evaluations.
    var links = page.getHtml().links().regex(str).all();
    page.addTargetRequests(links);
}
|
|
@ -0,0 +1,9 @@
|
||||||
|
// Values pulled from the current page with the $ helper.
var result = {};
result.title = $("div.BlogTitle h1");
result.content = $("div.BlogContent");

// Crawl settings declared by the script.
// NOTE(review): confirm that the host actually reads `result` and `config` back
// after evaluation; this script only defines them.
var config = {};
config.ua = '';
config.sleepTime = 20;

// Pass the blog-post URL pattern to the urls helper.
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+");
|
|
@ -0,0 +1,25 @@
|
||||||
|
package us.codecraft.webmagic.js;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import javax.script.ScriptContext;
|
||||||
|
import javax.script.ScriptEngine;
|
||||||
|
import javax.script.ScriptEngineManager;
|
||||||
|
import javax.script.ScriptException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
*/
|
||||||
|
public class TestJsCall {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() throws ScriptException {
|
||||||
|
ScriptEngineManager manager = new ScriptEngineManager();
|
||||||
|
ScriptEngine rubyEngine = manager.getEngineByName("javascript");
|
||||||
|
ScriptContext context = rubyEngine.getContext();
|
||||||
|
|
||||||
|
context.setAttribute("a", "sad", ScriptContext.ENGINE_SCOPE);
|
||||||
|
// rubyEngine.eval("", context);
|
||||||
|
rubyEngine.eval("print(a)", context);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue