From c3183252ac1ba7072213bda2d3fd4311d5204c2b Mon Sep 17 00:00:00 2001 From: zhuyue Date: Wed, 3 May 2017 18:24:19 +0800 Subject: [PATCH 1/4] Update RegexSelector.java --- .../webmagic/selector/RegexSelector.java | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 584cf90..bac8167 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -45,24 +45,14 @@ public class RegexSelector implements Selector { } private boolean hasGroup(String regexStr) { - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){ - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) { + int x = StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\("); + int a = StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:"); + int b = 
StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?="); + int c = StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<"); + int d = StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!"); + int e = StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#"); + + if (x == (a + b + c + d + e)) { return false; } return true; From 0c359a2bdebab19e34e40e7c052c6ff19d0695f3 Mon Sep 17 00:00:00 2001 From: zhuyue Date: Wed, 3 May 2017 18:24:41 +0800 Subject: [PATCH 2/4] Update RegexSelector.java --- .../webmagic/selector/RegexSelector.java | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 584cf90..bac8167 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -45,24 +45,14 @@ public class RegexSelector implements Selector { } private boolean hasGroup(String regexStr) { - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){ - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?!") - 
StringUtils.countMatches(regexStr, "\\(?!") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) { + int x = StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\("); + int a = StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:"); + int b = StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?="); + int c = StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<"); + int d = StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!"); + int e = StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#"); + + if (x == (a + b + c + d + e)) { return false; } return true; From c80f25edbd3a9c60ac07595847f3023cac5b868e Mon Sep 17 00:00:00 2001 From: zhuyue Date: Wed, 3 May 2017 18:33:23 +0800 Subject: [PATCH 3/4] Update RegexSelectorTest.java MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 简单的增加了一点测试 --- .../us/codecraft/webmagic/selector/RegexSelectorTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 144e6fe..871caa1 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -25,8 +25,8 @@ public class RegexSelectorTest { @Test public void testRegexWithZeroWidthAssertions() { - String regex = "^.*(?=\\?)"; - String source = "hello world?xxxx"; + String regex = "^.*(?=\\?)(?!\\?yy)"; + String source = "hello world?xx?yy"; RegexSelector regexSelector = new RegexSelector(regex); 
String select = regexSelector.select(source); Assertions.assertThat(select).isEqualTo("hello world"); From 9e1b7ed3f7ea40f9a29023be82b1f22eff6a389e Mon Sep 17 00:00:00 2001 From: zhuyue Date: Fri, 5 May 2017 10:47:10 +0800 Subject: [PATCH 4/4] Update RegexSelector.java --- .../webmagic/selector/RegexSelector.java | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index bac8167..1af6395 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -26,14 +26,16 @@ public class RegexSelector implements Selector { if (StringUtils.isBlank(regexStr)) { throw new IllegalArgumentException("regex must not be empty"); } - // Check bracket for regex group. Add default group 1 if there is no group. - // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern. - if ( ! hasGroup(regexStr) ){ - regexStr = "(" + regexStr + ")"; - } - this.regexStr = regexStr; + try { regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + // Count capturing groups on the compiled Pattern (which also validates the regex). Add default group 1 if there is none. + // Matcher.groupCount() ignores escaped parentheses, non-capturing groups and look-around assertions.
+ if ( regex.matcher("").groupCount() == 0 ){ + regexStr = "(" + regexStr + ")"; + regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + } + this.regexStr = regexStr; } catch (PatternSyntaxException e) { throw new IllegalArgumentException("invalid regex", e); } @@ -44,20 +46,6 @@ public class RegexSelector implements Selector { this(regexStr, 1); } - private boolean hasGroup(String regexStr) { - int x = StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\("); - int a = StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:"); - int b = StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?="); - int c = StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<"); - int d = StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!"); - int e = StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#"); - - if (x == (a + b + c + d + e)) { - return false; - } - return true; - } - @Override public String select(String text) { return selectGroup(text).get(group);