From 0633ea16ef621bff2f2228350fd7872dc5eb9174 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 11 Nov 2013 07:48:39 +0800 Subject: [PATCH] add javascript support --- .../webmagic/processor/JsScriptProcessor.java | 61 +++++++++++++++++++ ...rocessor.java => RubyScriptProcessor.java} | 22 +++---- .../src/main/resources/js/defines.js | 10 +++ .../src/main/resources/js/oschina.js | 9 +++ .../us/codecraft/webmagic/js/TestJsCall.java | 25 ++++++++ 5 files changed, 116 insertions(+), 11 deletions(-) create mode 100644 webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/JsScriptProcessor.java rename webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/{ScriptProcessor.java => RubyScriptProcessor.java} (70%) create mode 100644 webmagic-scripts/src/main/resources/js/defines.js create mode 100644 webmagic-scripts/src/main/resources/js/oschina.js create mode 100644 webmagic-scripts/src/test/java/us/codecraft/webmagic/js/TestJsCall.java diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/JsScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/JsScriptProcessor.java new file mode 100644 index 0000000..51ec04e --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/JsScriptProcessor.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.processor; + +import org.apache.commons.io.IOUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; + +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; +import java.io.IOException; +import java.io.InputStream; + +/** + * @author code4crafter@gmail.com + */ +public class JsScriptProcessor implements PageProcessor{ + + private ScriptEngine rubyEngine; + + private String defines; + + private String script; + + public JsScriptProcessor(String filename){ + ScriptEngineManager manager = new ScriptEngineManager(); + rubyEngine = manager.getEngineByName("javascript"); + InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("js/defines.js"); + try { + defines = IOUtils.toString(resourceAsStream); + resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename); + script = IOUtils.toString(resourceAsStream); + } catch (IOException e) { + e.printStackTrace(); + } + + + } + + @Override + public void process(Page page) { + ScriptContext context = rubyEngine.getContext(); + context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); + try { + rubyEngine.eval(defines+script, context); + } catch (ScriptException e) { + e.printStackTrace(); + } + + } + + @Override + public Site getSite() { + return Site.me(); + } + + public static void main(String[] args) { + Spider.create(new JsScriptProcessor("js/oschina.js")).addUrl("http://my.oschina.net/flashsword/blog").run(); + } +} diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/RubyScriptProcessor.java similarity index 70% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/RubyScriptProcessor.java index b821ae4..cf6801c 100644 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/ScriptProcessor.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/processor/RubyScriptProcessor.java @@ -15,39 +15,39 @@ import java.io.InputStream; /** * @author code4crafter@gmail.com */ -public class ScriptProcessor implements PageProcessor{ +public class RubyScriptProcessor implements PageProcessor{ private ScriptEngine rubyEngine; private String defines; - ScriptProcessor(){ + private String script; + + public RubyScriptProcessor(String filename){ ScriptEngineManager manager = new ScriptEngineManager(); rubyEngine = manager.getEngineByName("jruby"); InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/defines.rb"); try { defines = IOUtils.toString(resourceAsStream); + resourceAsStream = this.getClass().getClassLoader().getResourceAsStream(filename); + script = IOUtils.toString(resourceAsStream); } catch (IOException e) { e.printStackTrace(); } + + } @Override public void process(Page page) { ScriptContext context = rubyEngine.getContext(); context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE); - String script; try { - InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/oschina.rb"); - try { - script = IOUtils.toString(resourceAsStream); - rubyEngine.eval(defines+script, context); - } catch (IOException e) { - e.printStackTrace(); - } + rubyEngine.eval(defines+script, context); } catch (ScriptException e) { e.printStackTrace(); } + } @Override @@ -56,6 +56,6 @@ public class ScriptProcessor implements PageProcessor{ } public static void main(String[] args) { - Spider.create(new ScriptProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run(); + Spider.create(new RubyScriptProcessor("ruby/oschina.rb")).addUrl("http://my.oschina.net/flashsword/blog").run(); } } diff --git a/webmagic-scripts/src/main/resources/js/defines.js b/webmagic-scripts/src/main/resources/js/defines.js new file mode 100644 index 0000000..687edb3 --- /dev/null +++ b/webmagic-scripts/src/main/resources/js/defines.js @@ -0,0 +1,10 @@ +function $(str){ + return page.getHtml().$(str).toString(); +} +function xpath(str){ + return page.getHtml().xpath(str).toString(); +} +function urls(str){ + links = page.getHtml().links().regex(str).all(); + page.addTargetRequests(links); +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/resources/js/oschina.js b/webmagic-scripts/src/main/resources/js/oschina.js new file mode 100644 index 0000000..b3fc11a --- /dev/null +++ b/webmagic-scripts/src/main/resources/js/oschina.js @@ -0,0 +1,9 @@ +var result = { + title: $("div.BlogTitle h1"), + content: $("div.BlogContent") +} +var config = { + ua: '', + sleepTime : 20 +} +urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") \ No newline at end of file diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/js/TestJsCall.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/js/TestJsCall.java new file mode 100644 index 0000000..9b4ceeb --- /dev/null +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/js/TestJsCall.java @@ -0,0 +1,25 @@ +package us.codecraft.webmagic.js; + +import org.junit.Test; + +import javax.script.ScriptContext; +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; + +/** + * @author code4crafter@gmail.com + */ +public class TestJsCall { + + @Test + public void test() throws ScriptException { + ScriptEngineManager manager = new ScriptEngineManager(); + ScriptEngine rubyEngine = manager.getEngineByName("javascript"); + ScriptContext context = rubyEngine.getContext(); + + context.setAttribute("a", "sad", ScriptContext.ENGINE_SCOPE); +// rubyEngine.eval("", context); + rubyEngine.eval("print(a)", context); + } +}