Disable jsoup HTML entity escaping by default. Set Html.DISABLE_HTML_ENTITY_ESCAPE to false to re-enable it. #149
parent
4e6e946dd7
commit
9866297ec4
|
@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.nodes.Entities;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -19,6 +20,24 @@ public class Html extends HtmlNode {
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private static volatile boolean INITED = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Disable jsoup HTML entity escaping. Set this before the first Html instance is created: once escaping has been disabled, setting it back to false does not restore it.
|
||||||
|
*/
|
||||||
|
public static boolean DISABLE_HTML_ENTITY_ESCAPE = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Disable jsoup HTML entity escaping. This is a hack that is specific to jsoup 1.7.2.
|
||||||
|
*/
|
||||||
|
private void disableJsoupHtmlEntityEscape() {
|
||||||
|
if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) {
|
||||||
|
Entities.EscapeMode.base.getMap().clear();
|
||||||
|
Entities.EscapeMode.extended.getMap().clear();
|
||||||
|
INITED = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Store the parsed document for better performance when only one text exists.
|
* Store the parsed document for better performance when only one text exists.
|
||||||
*/
|
*/
|
||||||
|
@ -26,6 +45,7 @@ public class Html extends HtmlNode {
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
try {
|
try {
|
||||||
|
disableJsoupHtmlEntityEscape();
|
||||||
this.document = Jsoup.parse(text);
|
this.document = Jsoup.parse(text);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
this.document = null;
|
this.document = null;
|
||||||
|
|
|
@ -3,6 +3,8 @@ package us.codecraft.webmagic;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
|
@ -13,9 +15,19 @@ public class HtmlTest {
|
||||||
@Test
|
@Test
|
||||||
public void testRegexSelector() {
|
public void testRegexSelector() {
|
||||||
Html selectable = new Html("aaaaaaab");
|
Html selectable = new Html("aaaaaaab");
|
||||||
// Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString()));
|
assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab");
|
||||||
System.out.println(selectable.regex("(.*)").replace("aa(a)", "$1bb").toString());
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDisableJsoupHtmlEntityEscape() throws Exception {
|
||||||
|
Html html = new Html("aaaaaaa&b");
|
||||||
|
assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEnableJsoupHtmlEntityEscape() throws Exception {
|
||||||
|
Html.DISABLE_HTML_ENTITY_ESCAPE = false;
|
||||||
|
Html html = new Html("aaaaaaa&b");
|
||||||
|
assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue