From 04fcf3193f541a5190cc194e08c0c796677cbaad Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 23 Nov 2013 13:56:55 +0800 Subject: [PATCH] #38 Change algorithm of SmartContentSelector --- .../example/OschinaBlogPageProcesser.java | 2 +- .../selector/SmartContentSelector.java | 133 ++++++++---------- 2 files changed, 61 insertions(+), 74 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index 4ef830d..a59f1e9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -23,7 +23,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { //skip this page page.setSkip(true); } - page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); + page.putField("content", page.getHtml().smartContent().toString()); page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index efd4e11..ff8e269 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -1,100 +1,87 @@ package us.codecraft.webmagic.selector; -import org.apache.log4j.Logger; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; import us.codecraft.webmagic.utils.Experimental; -import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; /** - * Extract the text content of html.
- * Using Readability algorithm: find parents of all p tags. + * Borrowed from https://code.google.com/p/cx-extractor/ * * @author code4crafter@gmail.com
- * @since 0.1.0 + * @since 0.4.1 + * */ @Experimental public class SmartContentSelector implements Selector { - private Logger logger = Logger.getLogger(getClass()); - public SmartContentSelector() { } @Override - public String select(String text) { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - if (tagNode == null) { - return null; - } - TagNode[] nodes = tagNode.getElementsByName("p", true); - TagNode[] pres = tagNode.getElementsByName("pre", true); - Map pDensityCountMap = new HashMap(); - countPdensity(nodes, pDensityCountMap); - countPdensity(pres, pDensityCountMap); - for (TagNode pre : pres) { - addCounter(pre, pDensityCountMap, 2); - } - List> sortList = new ArrayList>(); - if (pDensityCountMap.size() == 0) { - return null; - } - for (Map.Entry entry : pDensityCountMap.entrySet()) { -// if (logger.isDebugEnabled()) { -// logger.debug("p\t" + entry.getKey().getName() + "#" + entry.getKey().getAttributeByName("id") + -// "@" + entry.getKey().getAttributeByName("class") + ":" + entry.getValue()); -// } - sortList.add(entry); - } + public String select(String html) { + html = html.replaceAll("(?is)", ""); + html = html.replaceAll("(?is)", ""); // remove html comment + html = html.replaceAll("(?is).*?", ""); // remove javascript + html = html.replaceAll("(?is).*?", ""); // remove css + html = html.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char + html = html.replaceAll("(?is)<.*?>", ""); + List lines; + int blocksWidth =3; + int threshold =86; + int start; + int end; + StringBuilder text = new StringBuilder(); + ArrayList indexDistribution = new ArrayList(); - Collections.sort(sortList, new Comparator>() { - @Override - public int compare(Map.Entry o1, Map.Entry o2) { - Double d1 = o1.getValue(); - Double d2 = o2.getValue(); - return -d1.compareTo(d2); + lines = Arrays.asList(html.split("\n")); + + for (int i = 0; i < lines.size() - blocksWidth; i++) { + int wordsNum = 0; + for (int j = i; j < i + blocksWidth; j++) { + lines.set(j, lines.get(j).replaceAll("\\s+", "")); + wordsNum += lines.get(j).length(); } - }); - TagNode contentNode = sortList.get(0).getKey(); - if (logger.isDebugEnabled()) { - logger.debug("p\t" + contentNode.getName() + "#" + contentNode.getAttributeByName("id") + - "@" + contentNode.getAttributeByName("class")); + indexDistribution.add(wordsNum); } - return htmlCleaner.getInnerHtml(contentNode); - } - private void addCounter(TagNode node, Map countMap, double delta) { - Double counter = countMap.get(node); - if (counter == null) { - countMap.put(node, delta); - } else { - countMap.put(node, counter + delta); - } - } + start = -1; end = -1; + boolean boolstart = false, boolend = false; + text.setLength(0); - private static final double parentWeight = 0.7; - - private void countPdensity(TagNode[] nodes, Map pDensityCountMap) { - for (TagNode node : nodes) { - if (node == null) { - continue; + for (int i = 0; i < indexDistribution.size() - 1; i++) { + if (indexDistribution.get(i) > threshold && ! boolstart) { + if (indexDistribution.get(i+1).intValue() != 0 + || indexDistribution.get(i+2).intValue() != 0 + || indexDistribution.get(i+3).intValue() != 0) { + boolstart = true; + start = i; + continue; + } } - TagNode parent = node.getParent(); - double pDensity = 1; - while (parent != null) { - addCounter(parent, pDensityCountMap, pDensity); - parent = parent.getParent(); - pDensity = pDensity * parentWeight; + if (boolstart) { + if (indexDistribution.get(i).intValue() == 0 + || indexDistribution.get(i+1).intValue() == 0) { + end = i; + boolend = true; + } + } + StringBuilder tmp = new StringBuilder(); + if (boolend) { + //System.out.println(start+1 + "\t\t" + end+1); + for (int ii = start; ii <= end; ii++) { + if (lines.get(ii).length() < 5) continue; + tmp.append(lines.get(ii) + "\n"); + } + String str = tmp.toString(); + //System.out.println(str); + if (str.contains("Copyright") ) continue; + text.append(str); + boolstart = boolend = false; } } - } - - private TagNode findLowestCommonParent(List tagNodes, int maxMargin, Map countMap) { - TagNode contentNode = tagNodes.get(0); - return contentNode; + return text.toString(); } @Override