Disable jsoup entity escaping by default. Set Html.DISABLE_HTML_ENTITY_ESCAPE to false to enable it. #149
parent
4e6e946dd7
commit
9866297ec4
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
|||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Entities;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -19,6 +20,24 @@ public class Html extends HtmlNode {
|
|||
|
||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private static volatile boolean INITED = false;
|
||||
|
||||
/**
|
||||
* Whether to disable jsoup HTML entity escaping. Set it before the first Html instance is created; once escaping has been disabled it is not re-enabled.
|
||||
*/
|
||||
public static boolean DISABLE_HTML_ENTITY_ESCAPE = true;
|
||||
|
||||
/**
|
||||
* Disable jsoup html entity escape. It is a hack way only for jsoup 1.7.2.
|
||||
*/
|
||||
private void disableJsoupHtmlEntityEscape() {
|
||||
if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) {
|
||||
Entities.EscapeMode.base.getMap().clear();
|
||||
Entities.EscapeMode.extended.getMap().clear();
|
||||
INITED = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores the parsed document for better performance when only one text exists.
|
||||
*/
|
||||
|
@ -26,6 +45,7 @@ public class Html extends HtmlNode {
|
|||
|
||||
public Html(String text) {
|
||||
try {
|
||||
disableJsoupHtmlEntityEscape();
|
||||
this.document = Jsoup.parse(text);
|
||||
} catch (Exception e) {
|
||||
this.document = null;
|
||||
|
|
|
@ -3,6 +3,8 @@ package us.codecraft.webmagic;
|
|||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
|
@ -13,9 +15,19 @@ public class HtmlTest {
|
|||
@Test
|
||||
public void testRegexSelector() {
|
||||
Html selectable = new Html("aaaaaaab");
|
||||
// Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()));
|
||||
System.out.println(selectable.regex("(.*)").replace("aa(a)", "$1bb").toString());
|
||||
|
||||
assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDisableJsoupHtmlEntityEscape() throws Exception {
|
||||
Html html = new Html("aaaaaaa&b");
|
||||
assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEnableJsoupHtmlEntityEscape() throws Exception {
|
||||
Html.DISABLE_HTML_ENTITY_ESCAPE = false;
|
||||
Html html = new Html("aaaaaaa&b");
|
||||
assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue