add scripts
parent
c2e04ea5a0
commit
df8ca8ad09
Binary file not shown.
After Width: | Height: | Size: 8.5 KiB |
1
pom.xml
1
pom.xml
|
@ -48,6 +48,7 @@
|
|||
<modules>
|
||||
<module>webmagic-core</module>
|
||||
<module>webmagic-extension/</module>
|
||||
<module>webmagic-scripts</module>
|
||||
</modules>
|
||||
|
||||
<dependencyManagement>
|
||||
|
|
|
@ -45,6 +45,16 @@ public class PlainText implements Selectable {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable css(String selector) {
|
||||
return $(selector);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable css(String selector, String attrName) {
|
||||
return $(selector, attrName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable smartContent() {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -35,6 +35,23 @@ public interface Selectable {
|
|||
*/
|
||||
public Selectable $(String selector, String attrName);
|
||||
|
||||
/**
|
||||
* select list with css selector
|
||||
*
|
||||
* @param selector css selector expression
|
||||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable css(String selector);
|
||||
|
||||
/**
|
||||
* select list with css selector
|
||||
*
|
||||
* @param selector css selector expression
|
||||
* @param attrName attribute name of css selector
|
||||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable css(String selector, String attrName);
|
||||
|
||||
/**
|
||||
* select smart content with ReadAbility algorithm
|
||||
*
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module descriptor for webmagic-scripts (script-driven page processors). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>webmagic-parent</artifactId>
        <groupId>us.codecraft</groupId>
        <version>0.4.1-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <!-- Module coordinates; groupId and version repeat the parent's values. -->
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-scripts</artifactId>
    <version>0.4.1-SNAPSHOT</version>

    <dependencies>
        <!-- JRuby runtime, looked up through javax.script under the engine name "jruby". -->
        <dependency>
            <groupId>org.jruby</groupId>
            <artifactId>jruby</artifactId>
            <version>1.7.6</version>
        </dependency>
        <!-- No version here: presumably managed by the parent's dependencyManagement (confirm). -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- Core crawler API (Page, Site, Spider, PageProcessor), kept at this module's version. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${project.version}</version>
        </dependency>
    </dependencies>

</project>
|
|
@ -0,0 +1,61 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
|
||||
import javax.script.ScriptContext;
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptEngineManager;
|
||||
import javax.script.ScriptException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class ScriptProcessor implements PageProcessor{
|
||||
|
||||
private ScriptEngine rubyEngine;
|
||||
|
||||
private String defines;
|
||||
|
||||
ScriptProcessor(){
|
||||
ScriptEngineManager manager = new ScriptEngineManager();
|
||||
rubyEngine = manager.getEngineByName("jruby");
|
||||
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/defines.rb");
|
||||
try {
|
||||
defines = IOUtils.toString(resourceAsStream);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
ScriptContext context = rubyEngine.getContext();
|
||||
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
|
||||
String script;
|
||||
try {
|
||||
InputStream resourceAsStream = this.getClass().getClassLoader().getResourceAsStream("ruby/oschina.rb");
|
||||
try {
|
||||
script = IOUtils.toString(resourceAsStream);
|
||||
rubyEngine.eval(defines+script, context);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
} catch (ScriptException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new ScriptProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
# Runs the XPath expression +str+ against the HTML of the host-supplied $page
# and returns the selection's toString() value.
def xpath(str)
  selection = $page.getHtml().xpath(str)
  selection.toString()
end
|
||||
# Runs the css selector +str+ against the HTML of the host-supplied $page
# and returns the selection's toString() value.
def css(str)
  selection = $page.getHtml().css(str)
  selection.toString()
end
|
||||
# Collects every link on the host-supplied $page that matches the regex +str+
# and registers the matches on the page as target requests.
def urls(str)
  matching_links = $page.getHtml().links().regex(str).all()
  $page.addTargetRequests(matching_links)
end
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
# Extraction script: pulls the blog title and body via the css helper,
# queues further blog-post links via the urls helper, then prints both values.
title = css("div.BlogTitle h1")
content = css("div.BlogContent")
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+")
puts(title)
puts(content)
|
|
@ -0,0 +1,25 @@
|
|||
package us.codecraft.webmagic.jruby;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import javax.script.ScriptContext;
|
||||
import javax.script.ScriptEngine;
|
||||
import javax.script.ScriptEngineManager;
|
||||
import javax.script.ScriptException;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class TestJRubyCall {
|
||||
|
||||
@Test
|
||||
public void test() throws ScriptException {
|
||||
ScriptEngineManager manager = new ScriptEngineManager();
|
||||
ScriptEngine rubyEngine = manager.getEngineByName("jruby");
|
||||
ScriptContext context = rubyEngine.getContext();
|
||||
|
||||
context.setAttribute("a", "sad", ScriptContext.ENGINE_SCOPE);
|
||||
// rubyEngine.eval("", context);
|
||||
rubyEngine.eval("b=1; puts b", context);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue