From b9eeb88f7773a8de62a86f2b68ae2a31994c29e4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 07:51:18 +0800 Subject: [PATCH] benchmark --- .../webmagic/model/ProcessorBenchmark.java | 891 ++++++++++++++++++ 1 file changed, 891 insertions(+) create mode 100644 webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java new file mode 100644 index 0000000..5513305 --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java @@ -0,0 +1,891 @@ +package us.codecraft.webmagic.model; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * Hand-run timing harness for ModelPageProcessor with the OschinaBlog model. + * It processes one fixed, hand-built Page repeatedly and prints the elapsed + * wall-clock milliseconds of each batch to standard output. + * + * @author code4crafter@gmail.com + */ +public class ProcessorBenchmark { + + /** + * Runs two batches of 1000 process(page) calls and prints the elapsed + * milliseconds of each batch on its own line. The method makes no + * assertions. It is annotated with Ignore, so JUnit is expected to skip it + * unless that annotation is removed. + */ + @Ignore + @Test + public void test() { + /* Create the processor for the OschinaBlog model; the Site is given the blog URL as its start URL. */ + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); + /* Assemble the Page by hand: request, URL and markup all come from values hard-coded in this class. */ + Page page = new Page(); + page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); + page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); + page.setHtml(new Html(html)); + /* First batch: 1000 calls on the same Page instance, timed with currentTimeMillis. */ + long time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + /* Second batch: identical work timed again. NOTE(review): presumably the first batch doubles as a warm-up and this second figure is the one to read; confirm intent. */ + time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + } + + /** Hard-coded page markup handed to the processor as the benchmark input; it is runtime data, so keep it unchanged. */ + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " 
Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t\t
\n" + + " \t开源中国社区\n" + + "
\n" + + "
开源项目发现、使用和交流平台
\n" + + "\t\t
\n" + + " \t\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\n" + + "\t\t当前访客身份:\n" + + "\t\t\t\t黄亿华 [ 退出 ]\n" + + "\t\t\t\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\n" + + "\t\t
\n" + + "\t\t
\n" + + " \t\t
\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + + " \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t
\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + "
软件
\n" + + " \n" + + "
\n" + + "\t\t\t\t\t\t\t\n" + + " \t\t
\n" + + "\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "\t
\t\n" + + "\n" + + "
\n" + + "
\n" + + "\t\t切换风格 \"黄亿华\"\n" + + " \n" + + " 黄亿华\n" + + "\t\t\n" + + "\t\t\t\n" + + " \t\t\t修改资料\n" + + "\t\t\t更换头像\n" + + " \t\t\n" + + " \n" + + "
\n" + + "
\n" + + " \t关注(43)\n" + + " \t粉丝(98)\n" + + " \t积分(173)\n" + + "
\n" + + "
\n" + + "
\n" + + "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t.发表博文\n" + + "\t.空间管理\n" + + "
\n" + + " 管理» 博客分类\n" + + " \n" + + "
\n" + + "
\n" + + " 管理» 最新评论 \n" + + "
    \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:极好的工具,\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t
\n" + + "
\n" + + "
\n" + + "访客统计\n" + + "
    \n" + + "\t
  • 6 (查看最新访客»)
  • \n" + + "
  • 284
  • \n" + + "
  • 817
  • \n" + + "
  • 1888
  • \n" + + "
  • 16453
  • \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\t
\n" + + " \t\n" + + "\t
\n" + + "\t\n" + + " \t
\t\t\n" + + "
\n" + + "

Jsoup代码解读之八-防御XSS攻击

\n" + + "
\n" + + " \t\t \t\t \t\t\n" + + " \t\t\t编辑 | 删除\n" + + " \t\t\n" + + "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + + " \t\t已有1628次阅读 ,共3个评论\n" + + " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + + "
\n" + + "\t \t
\n" + + "

目录:[ - ]

\n" + + " \n" + + " \t
\n" + + " \n" + + "\t \t

\n" + + "\n" + + "

防御XSS攻击的一般原理

\n" + + "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + + "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + + "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + + "
    \n" + + "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + + "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + + "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + + "
\n" + + "\n" + + "

Cleaner与Whitelist

\n" + + "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + + "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + + "
public class Whitelist {\n" +
+            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
+            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
+            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
+            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
+            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
+            "}
\n" + + "

这里定义了标签名/属性名/属性值的白名单。

\n" + + "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + + "
private final class CleaningVisitor implements NodeVisitor {\n" +
+            "    private int numDiscarded = 0;\n" +
+            "    private final Element root;\n" +
+            "    private Element destination; // current element to append nodes to\n" +
+            "\n" +
+            "    private CleaningVisitor(Element root, Element destination) {\n" +
+            "        this.root = root;\n" +
+            "        this.destination = destination;\n" +
+            "    }\n" +
+            "\n" +
+            "    public void head(Node source, int depth) {\n" +
+            "        if (source instanceof Element) {\n" +
+            "            Element sourceEl = (Element) source;\n" +
+            "\n" +
+            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
+            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
+            "                Element destChild = meta.el;\n" +
+            "                destination.appendChild(destChild);\n" +
+            "\n" +
+            "                numDiscarded += meta.numAttribsDiscarded;\n" +
+            "                destination = destChild;\n" +
+            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
+            "                numDiscarded++;\n" +
+            "            }\n" +
+            "        } else if (source instanceof TextNode) {\n" +
+            "            TextNode sourceText = (TextNode) source;\n" +
+            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
+            "            destination.appendChild(destText);\n" +
+            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
+            "            numDiscarded++;\n" +
+            "        }\n" +
+            "    }\n" +
+            "\n" +
+            "    public void tail(Node source, int depth) {\n" +
+            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
+            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
+            "        }\n" +
+            "    }\n" +
+            "}
\n" + + "\n" + + "

结束语

\n" + + "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + + "
    \n" + + "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + + "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + + "
\n" + + "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + + " \t \t \n" + + " \t\n" + + "\t
\n" + + " \t关键字:\n" + + " \t \tJsoup\n" + + " \t \tXSS\n" + + " \t \tOO\n" + + " \t \t
\n" + + "\t \t \n" + + "
\t\t\n" + + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + + "\t \t
\n" + + "\n" + + " \n" + + "\t
\n" + + "\n" + + "\t\n" + + "\t
\n" + + "\t\n" + + "\t\n" + + "\t\t分享到: \n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + " 已有 0人顶\n" + + "\t\n" + + "\t
\n" + + "\t\t\n" + + "
\n" + + "
\n" + + "
\n" + + "

共有 3 条网友评论

\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"静风流云\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    貌似,OSC也是类似处理的。
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"黄亿华\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t
    \n" + + "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"searchjack\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    极好的工具,
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t \n" + + "\t \n" + + "\t 文明上网,理性发言\n" + + "
\n" + + "\t回到页首 | 回到评论列表\n" + + "
\n" + + "
\n" + + "\t\n" + + "
\n" + + "\t关闭相关文章阅读\n" + + "\t\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""; +}