From 9866297ec4a240eb9fbfef34ccb3d29e3d5d6499 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 14 Aug 2014 08:04:56 +0800 Subject: [PATCH] Disable jsoup entity escape by Default. Set Html.DISABLE_HTML_ENTITY_ESCAPE to false to enable it. #149 --- .../us/codecraft/webmagic/selector/Html.java | 20 +++++++++++++++++++ .../java/us/codecraft/webmagic/HtmlTest.java | 18 ++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 7b593ed..1ccd67c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Entities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -19,6 +20,24 @@ public class Html extends HtmlNode { private Logger logger = LoggerFactory.getLogger(getClass()); + private static volatile boolean INITED = false; + + /** + * Disable jsoup html entity escape. It can be set just before any Html instance is created. + */ + public static boolean DISABLE_HTML_ENTITY_ESCAPE = true; + + /** + * Disable jsoup html entity escape. It is a hack way only for jsoup 1.7.2. + */ + private void disableJsoupHtmlEntityEscape() { + if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) { + Entities.EscapeMode.base.getMap().clear(); + Entities.EscapeMode.extended.getMap().clear(); + INITED = true; + } + } + /** * Store parsed document for better performance when only one text exist. 
*/ @@ -26,6 +45,7 @@ public class Html extends HtmlNode { public Html(String text) { try { + disableJsoupHtmlEntityEscape(); this.document = Jsoup.parse(text); } catch (Exception e) { this.document = null; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index fa66c3a..c5780b4 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -3,6 +3,8 @@ package us.codecraft.webmagic; import org.junit.Test; import us.codecraft.webmagic.selector.Html; +import static org.assertj.core.api.Assertions.assertThat; + /** * @author code4crafter@gmail.com
* Date: 13-4-21 @@ -13,9 +15,19 @@ public class HtmlTest { @Test public void testRegexSelector() { Html selectable = new Html("aaaaaaab"); -// Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); - System.out.println(selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()); - + assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab"); } + @Test + public void testDisableJsoupHtmlEntityEscape() throws Exception { + Html html = new Html("aaaaaaa&b"); + assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b"); + } + + @Test + public void testEnableJsoupHtmlEntityEscape() throws Exception { + Html.DISABLE_HTML_ENTITY_ESCAPE = false; + Html html = new Html("aaaaaaa&b"); + assertThat(html.regex("(aaaaaaa&amp;b)").toString()).isEqualTo("aaaaaaa&amp;b"); + } }