How to change case with String.replaceAll()
Is it possible to use the regex replacement feature (String.replaceAll or Matcher.replaceAll) to change matched text to upper or lower case?
For example I have the input text:
-aaa- -bbb-
I want to change the hyphens to underscores, and change the text inside to upper case.
The first thing is easy:
String.replaceAll("-(.*?)-", "_$1_")
but how to change text inside $1 to upper case (or lower, if needed)?
I would like to have an option to write replacement regex like for example "_$+1_" where $+1 means "$1 with change to upper case" - but obviously there is no such option.
Obviously, I can write my own method being able to parse the syntax like this.
But perhaps there is such functionality already implemented? Do you know how to solve it?
Here you go:
import java.util.regex.*;
/**
 * RegexHelper provides alternative implementations of the Matcher
 * methods {@code replaceAll} and {@code replaceFirst}, with enhanced
 * processing of the replacement strings. The major enhancement is that
 * capturing-group references can contain conversion specifiers that cause
 * the captured text to be processed in some way before being appended to
 * the output string. Conversions can be specified by adding the appropriate
 * character between the dollar sign and the digit(s); valid conversions are:
 *
 * <ul>
 * <li>{@code U} : converts all characters in the group to uppercase.</li>
 * <li>{@code L} : converts all characters in the group to lowercase.</li>
 * <li>{@code T} : converts the first letter of each word to uppercase
 * and all other letters to lowercase.</li>
 * <li>{@code t} : converts the first letter in the group to uppercase
 * and all other letters to lowercase.</li>
 * </ul>
 *
 * <p>Conversions can only be performed on captured groups, and only one at
 * a time. To perform a conversion on the entire match, use (for example)
 * {@code $U0}.
 *
 * <p>Another enhancement is that the whitespace escapes "\t" and "\n", if
 * present in the replacement string, will be converted to TAB and LF,
 * respectively. Unicode escapes of the form "&#92;uXXXX" (a backslash, the
 * letter u and four hexadecimal digits) are also supported, just as they
 * are in the {@code regex} argument, which means any Unicode character can
 * be added to the output string. Other than that, backslash is used to
 * indicate that the following character should not be treated specially:
 * "\$" becomes '$', and "\\" becomes '\'. If you want to insert the contents
 * of group 1 followed by a zero, but your regex has ten or more capturing
 * groups, you would use "$1\0". If a backslash is the last character in the
 * replacement string, it will be appended to the output string.
 *
 * <p>A dollar sign that does not introduce a valid group reference is
 * copied to the output literally.
 *
 * @author Alan Moore
 */
public class RegexHelper
{
    /**
     * Replaces every match of {@code regex} in {@code str} with the
     * expansion of the enhanced replacement string {@code repl}.
     *
     * @param str   the input text
     * @param regex the regular expression to search for
     * @param repl  the replacement string (see the class description for
     *              the supported syntax)
     * @return the text with all matches replaced, or {@code str} itself
     *         when nothing matched
     */
    public static String replaceAll(String str, String regex, String repl)
    {
        return replace(str, regex, repl, true);
    }

    /**
     * Replaces only the first match of {@code regex} in {@code str} with
     * the expansion of the enhanced replacement string {@code repl}.
     *
     * @param str   the input text
     * @param regex the regular expression to search for
     * @param repl  the replacement string (see the class description for
     *              the supported syntax)
     * @return the text with the first match replaced, or {@code str}
     *         itself when nothing matched
     */
    public static String replaceFirst(String str, String regex, String repl)
    {
        return replace(str, regex, repl, false);
    }

    /** Shared driver for replaceAll/replaceFirst; {@code all} selects which. */
    private static String replace(String str, String regex, String repl, boolean all)
    {
        Matcher m = Pattern.compile(regex).matcher(str);
        if (!m.find())
        {
            return str;
        }
        StringBuffer sb = new StringBuffer();
        do
        {
            // An empty replacement makes the Matcher copy only the text
            // that precedes the match; our own expansion follows it.
            m.appendReplacement(sb, "");
            appendReplacement(sb, m, repl);
        } while (all && m.find());
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Expands {@code repl} against the current match of {@code m} and
     * appends the result to {@code sb}.
     *
     * @param sb   the buffer receiving the expanded replacement
     * @param m    a matcher positioned on a successful match
     * @param repl the replacement string to expand
     */
    static void appendReplacement(StringBuffer sb, Matcher m, String repl)
    {
        int cursor = 0;
        int len = repl.length();
        while (cursor < len)
        {
            char nextChar = repl.charAt(cursor);
            if (nextChar == '\\' && cursor < len - 1)
            {
                nextChar = repl.charAt(++cursor);
                switch (nextChar)
                {
                    case 't':
                        sb.append('\t');
                        break;
                    case 'n':
                        sb.append('\n');
                        break;
                    case 'u':
                    {
                        // -1 means "not four hex digits"; any other value,
                        // including zero, is a legitimate code unit.
                        int codeUnit = unicodeValue(repl, cursor + 1);
                        if (codeUnit >= 0)
                        {
                            cursor += 4;
                            sb.append((char) codeUnit);
                        }
                        else
                        {
                            sb.append('u');
                        }
                        break;
                    }
                    default:
                        sb.append(nextChar);
                }
                cursor++;
            }
            else if (nextChar == '$' && cursor < len - 1)
            {
                Conversion conversion = Conversion.NONE;
                char nextNextChar = repl.charAt(++cursor);
                if (asciiDigit(nextNextChar) == -1)
                {
                    if (cursor == len - 1)
                    {
                        // '$' plus one trailing non-digit: plain text.
                        sb.append('$').append(nextNextChar);
                        cursor++;
                        continue;
                    }
                    conversion = Conversion.valueOf(nextNextChar);
                    if (conversion == Conversion.NONE)
                    {
                        sb.append('$');
                        // A backslash is left in place so the main loop
                        // still treats it as the start of an escape.
                        if (nextNextChar != '\\')
                        {
                            sb.append(nextNextChar);
                            cursor++;
                        }
                        continue;
                    }
                    nextChar = repl.charAt(++cursor);
                }
                else
                {
                    nextChar = nextNextChar;
                }
                int digit = asciiDigit(nextChar);
                if (digit == -1 || m.groupCount() < digit)
                {
                    // It's not a (valid) group reference: emit what has
                    // been consumed so far and let the main loop handle
                    // nextChar, so that an escape or a group reference
                    // starting there is still honoured.
                    sb.append('$');
                    if (conversion != Conversion.NONE)
                    {
                        sb.append(nextNextChar);
                    }
                    continue;
                }
                int refNum = digit;
                // Capture the largest legal group number
                while (++cursor < len)
                {
                    digit = asciiDigit(repl.charAt(cursor));
                    if (digit == -1)
                    {
                        break;
                    }
                    int newRefNum = (refNum * 10) + digit;
                    if (m.groupCount() < newRefNum)
                    {
                        break;
                    }
                    refNum = newRefNum;
                }
                // Append group if it matched anything
                String group = m.group(refNum);
                if (group != null && group.length() > 0)
                {
                    sb.append(conversion.convert(group));
                }
            }
            else
            {
                sb.append(nextChar);
                cursor++;
            }
        }
    }

    /** Returns the value of an ASCII decimal digit, or -1 for any other character. */
    private static int asciiDigit(char ch)
    {
        return (ch >= '0' && ch <= '9') ? ch - '0' : -1;
    }

    /**
     * Parses the four hexadecimal digits starting at {@code cursor}.
     *
     * @return the parsed value (0 to 0xFFFF), or -1 if fewer than four
     *         characters remain or any of them is not a hex digit
     */
    private static int unicodeValue(String repl, int cursor)
    {
        int end = cursor + 4;
        if (end > repl.length())
        {
            return -1;
        }
        int result = 0;
        for (int i = cursor; i < end; i++)
        {
            int n = Character.digit(repl.charAt(i), 16);
            if (n == -1)
            {
                return -1;
            }
            result = result * 16 + n;
        }
        return result;
    }

    /**
     * Title-cases {@code str}: the first letter (or, when {@code allWords}
     * is true, the first letter after each run of whitespace as well) is
     * converted to title case and every other character to lower case.
     * The text is processed by code point.
     *
     * @param str      the text to convert; null or empty is returned as is
     * @param allWords true to capitalise every word, false for only the
     *                 first letter of the whole text
     * @return the converted text
     */
    private static String titleize(String str, boolean allWords)
    {
        int strLen;
        if (str == null || (strLen = str.length()) == 0)
        {
            return str;
        }
        StringBuilder sb = new StringBuilder(strLen);
        boolean capitalizeNext = true;
        for (int i = 0; i < strLen; )
        {
            int cp = str.codePointAt(i);
            i += Character.charCount(cp);
            if (Character.isWhitespace(cp))
            {
                sb.appendCodePoint(cp);
                // Whitespace may re-arm the flag but never clears it, so
                // leading whitespace does not suppress the first capital.
                if (allWords)
                {
                    capitalizeNext = true;
                }
            }
            else if (capitalizeNext && Character.isLetter(cp))
            {
                sb.appendCodePoint(Character.toTitleCase(cp));
                capitalizeNext = false;
            }
            else
            {
                sb.appendCodePoint(Character.toLowerCase(cp));
            }
        }
        return sb.toString();
    }

    /** The conversions selectable by the character placed after the dollar sign. */
    private enum Conversion
    {
        NONE('\0')
        {
            @Override
            public String convert(String str) { return str; }
        },
        TO_UPPER_CASE('U')
        {
            @Override
            public String convert(String str) { return str.toUpperCase(); }
        },
        TO_LOWER_CASE('L')
        {
            @Override
            public String convert(String str) { return str.toLowerCase(); }
        },
        TO_TITLE_CASE('t')
        {
            @Override
            public String convert(String str) { return RegexHelper.titleize(str, false); }
        },
        TITLE_CASE_ALL('T')
        {
            @Override
            public String convert(String str) { return RegexHelper.titleize(str, true); }
        };

        /**
         * Looks up a conversion by its specifier character.
         *
         * @return the matching conversion, or {@link #NONE} if the
         *         character is not a known specifier
         */
        public static Conversion valueOf(char ch)
        {
            for (Conversion conv : values())
            {
                if (conv.specifier == ch)
                {
                    return conv;
                }
            }
            return NONE;
        }

        private final char specifier;

        private Conversion(char specifier)
        {
            this.specifier = specifier;
        }

        /** Applies this conversion to the captured text. */
        public abstract String convert(String str);
    }

    /** Small demonstration using the example from the question. */
    public static void main(String... args)
    {
        String s = "-aaa- -bbb-";
        System.out.println(RegexHelper.replaceAll(s, "-(.*?)-", "_$t1_"));
        System.out.println(RegexHelper.replaceFirst(s, "-(.*?)-", "_$U1_"));
    }

    // Utility class: not instantiable.
    private RegexHelper() {}
}