Merge pull request #1086 from vioao/enhance_jsoup_parse_table
Enhance Jsoup parsing so that tr and td tags can be parsed directly
master
commit
69accb656a
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -11,11 +12,24 @@ import java.util.List;
|
||||||
* @since 0.3.0
|
* @since 0.3.0
|
||||||
*/
|
*/
|
||||||
public abstract class BaseElementSelector implements Selector, ElementSelector {
|
public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||||
|
private Document parse(String text) {
|
||||||
|
if (text == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Jsoup could not parse <tr></tr> or <td></td> tag directly
|
||||||
|
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
|
||||||
|
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
|
||||||
|
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
|
||||||
|
text = "<table>" + text + "</table>";
|
||||||
|
}
|
||||||
|
return Jsoup.parse(text);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(String text) {
|
public String select(String text) {
|
||||||
if (text != null) {
|
if (text != null) {
|
||||||
return select(Jsoup.parse(text));
|
return select(parse(text));
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -23,7 +37,7 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(String text) {
|
public List<String> selectList(String text) {
|
||||||
if (text != null) {
|
if (text != null) {
|
||||||
return selectList(Jsoup.parse(text));
|
return selectList(parse(text));
|
||||||
} else {
|
} else {
|
||||||
return new ArrayList<String>();
|
return new ArrayList<String>();
|
||||||
}
|
}
|
||||||
|
@ -31,14 +45,14 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||||
|
|
||||||
public Element selectElement(String text) {
|
public Element selectElement(String text) {
|
||||||
if (text != null) {
|
if (text != null) {
|
||||||
return selectElement(Jsoup.parse(text));
|
return selectElement(parse(text));
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Element> selectElements(String text) {
|
public List<Element> selectElements(String text) {
|
||||||
if (text != null) {
|
if (text != null) {
|
||||||
return selectElements(Jsoup.parse(text));
|
return selectElements(parse(text));
|
||||||
} else {
|
} else {
|
||||||
return new ArrayList<Element>();
|
return new ArrayList<Element>();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue