Check valid left parenthesis
parent
0c3ff3d6b1
commit
83926970b2
|
@ -1,93 +1,102 @@
|
|||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
/**
|
||||
* Selector in regex.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
*/
|
||||
public class RegexSelector implements Selector {
|
||||
|
||||
private String regexStr;
|
||||
|
||||
private Pattern regex;
|
||||
|
||||
private int group = 1;
|
||||
|
||||
public RegexSelector(String regexStr, int group) {
|
||||
if (StringUtils.isBlank(regexStr)) {
|
||||
throw new IllegalArgumentException("regex must not be empty");
|
||||
}
|
||||
if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) {
|
||||
regexStr = "(" + regexStr + ")";
|
||||
}
|
||||
if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) {
|
||||
throw new IllegalArgumentException("regex must have capture group 1");
|
||||
}
|
||||
this.regexStr = regexStr;
|
||||
try {
|
||||
regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) {
|
||||
throw new IllegalArgumentException("invalid regex", e);
|
||||
}
|
||||
this.group = group;
|
||||
}
|
||||
|
||||
public RegexSelector(String regexStr) {
|
||||
this(regexStr, 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
return selectGroup(text).get(group);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
List<String> strings = new ArrayList<String>();
|
||||
List<RegexResult> results = selectGroupList(text);
|
||||
for (RegexResult result : results) {
|
||||
strings.add(result.get(group));
|
||||
}
|
||||
return strings;
|
||||
}
|
||||
|
||||
public RegexResult selectGroup(String text) {
|
||||
Matcher matcher = regex.matcher(text);
|
||||
if (matcher.find()) {
|
||||
String[] groups = new String[matcher.groupCount() + 1];
|
||||
for (int i = 0; i < groups.length; i++) {
|
||||
groups[i] = matcher.group(i);
|
||||
}
|
||||
return new RegexResult(groups);
|
||||
}
|
||||
return RegexResult.EMPTY_RESULT;
|
||||
}
|
||||
|
||||
public List<RegexResult> selectGroupList(String text) {
|
||||
Matcher matcher = regex.matcher(text);
|
||||
List<RegexResult> resultList = new ArrayList<RegexResult>();
|
||||
while (matcher.find()) {
|
||||
String[] groups = new String[matcher.groupCount() + 1];
|
||||
for (int i = 0; i < groups.length; i++) {
|
||||
groups[i] = matcher.group(i);
|
||||
}
|
||||
resultList.add(new RegexResult(groups));
|
||||
}
|
||||
return resultList;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return regexStr;
|
||||
}
|
||||
|
||||
}
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
/**
|
||||
* Selector in regex.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
*/
|
||||
public class RegexSelector implements Selector {
|
||||
|
||||
private String regexStr;
|
||||
|
||||
private Pattern regex;
|
||||
|
||||
private int group = 1;
|
||||
|
||||
public RegexSelector(String regexStr, int group) {
|
||||
if (StringUtils.isBlank(regexStr)) {
|
||||
throw new IllegalArgumentException("regex must not be empty");
|
||||
}
|
||||
/* Can't detect '\(', '(?:)' so that would be result in ArrayIndexOutOfBoundsException
|
||||
if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) {
|
||||
regexStr = "(" + regexStr + ")";
|
||||
}
|
||||
if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) {
|
||||
throw new IllegalArgumentException("regex must have capture group 1");
|
||||
}
|
||||
*/
|
||||
|
||||
// Try to fix: Only check if there exists the valid left parenthesis, leave regexp validation for Pattern
|
||||
if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\\\\\(") ==
|
||||
StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\\\\\(?:")) {
|
||||
regexStr = "(" + regexStr + ")";
|
||||
}
|
||||
|
||||
this.regexStr = regexStr;
|
||||
try {
|
||||
regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) {
|
||||
throw new IllegalArgumentException("invalid regex", e);
|
||||
}
|
||||
this.group = group;
|
||||
}
|
||||
|
||||
public RegexSelector(String regexStr) {
|
||||
this(regexStr, 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
return selectGroup(text).get(group);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
List<String> strings = new ArrayList<String>();
|
||||
List<RegexResult> results = selectGroupList(text);
|
||||
for (RegexResult result : results) {
|
||||
strings.add(result.get(group));
|
||||
}
|
||||
return strings;
|
||||
}
|
||||
|
||||
public RegexResult selectGroup(String text) {
|
||||
Matcher matcher = regex.matcher(text);
|
||||
if (matcher.find()) {
|
||||
String[] groups = new String[matcher.groupCount() + 1];
|
||||
for (int i = 0; i < groups.length; i++) {
|
||||
groups[i] = matcher.group(i);
|
||||
}
|
||||
return new RegexResult(groups);
|
||||
}
|
||||
return RegexResult.EMPTY_RESULT;
|
||||
}
|
||||
|
||||
public List<RegexResult> selectGroupList(String text) {
|
||||
Matcher matcher = regex.matcher(text);
|
||||
List<RegexResult> resultList = new ArrayList<RegexResult>();
|
||||
while (matcher.find()) {
|
||||
String[] groups = new String[matcher.groupCount() + 1];
|
||||
for (int i = 0; i < groups.length; i++) {
|
||||
groups[i] = matcher.group(i);
|
||||
}
|
||||
resultList.add(new RegexResult(groups));
|
||||
}
|
||||
return resultList;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return regexStr;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue