Tôi không biết có cách tiêu chuẩn nào cho việc này; tôi đã dùng một giải pháp tương tự như cách bạn đang đề cập. Tôi không chắc có cách nào tốt hơn, vì vậy đây là giải pháp của tôi:
/**
 * String helpers for turning arbitrary text into URL-friendly slugs.
 *
 * <p>All methods are static. None of them accept {@code null}; passing it results in a
 * {@link NullPointerException}.
 */
public class TextUtils {

    /** Combining diacritical marks plus modifier letters (Lm) and modifier symbols (Sk). */
    private static final Pattern DIACRITICS_AND_FRIENDS =
            Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");

    private static final Transliterator TO_LATIN_TRANSLITERATOR = Transliterator.getInstance("Any-Latin");

    // Letters that have no Unicode decomposition and therefore survive NFKD normalization.
    // Both cases are listed because replacement happens BEFORE lower-casing.
    private static final Pattern EEQUIVALENTS = Pattern.compile("[ǝƎƏə]+");
    private static final Pattern IEQUIVALENTS = Pattern.compile("[ı]+");
    private static final Pattern DEQUIVALENTS = Pattern.compile("[ÐðĐđ]+");
    private static final Pattern OEQUIVALENTS = Pattern.compile("[Øø]+");
    private static final Pattern LEQUIVALENTS = Pattern.compile("[Łł]+");

    // Whitespace, non-ASCII characters, Unicode punctuation and '+', except '_'.
    // Hyphens never reach this pattern: SEPARATORS has already turned them into '_'.
    // ASCII symbols that are not punctuation in the Unicode sense (e.g. '$', '=') are NOT matched.
    private static final Pattern CRAP = Pattern.compile("[\\p{IsSpace}\\P{IsASCII}\\p{IsP}\\+&&[^_]]");

    /** Characters that separate words and are mapped to '_'. */
    private static final Pattern SEPARATORS = Pattern.compile("[\\p{IsSpace}/`-]");

    /** Shape of an acceptable result: ASCII letters, digits and '_' only (the empty string matches). */
    private static final Pattern URLFRIENDLY = Pattern.compile("[a-zA-Z0-9_]*");

    /** Charset objects are immutable and safe to share; encoders are not. */
    private static final Charset ASCII = Charset.forName("US-ASCII");

    /**
     * Tells whether the text consists solely of 7-bit US-ASCII characters.
     *
     * @param text the text to inspect
     * @return {@code true} when every character is in the US-ASCII range, {@code false} otherwise
     */
    public static boolean isPureAscii(String text) {
        // A CharsetEncoder is stateful and not thread-safe, so create one per call
        // instead of sharing a static instance.
        return ASCII.newEncoder().canEncode(text);
    }

    /**
     * Strips diacritics by decomposing the text (NFKD) and deleting the combining marks,
     * modifier letters and modifier symbols, leaving the base symbols (e.g. ü -> u).
     *
     * @param text the text to strip
     * @return the text without combining diacritical marks
     */
    public static String replaceCombiningDiacriticalMarks(String text) {
        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFKD);
        return DIACRITICS_AND_FRIENDS.matcher(decomposed).replaceAll("");
    }

    /**
     * Turns the input string into a url friendly variant containing only lower-case ASCII
     * letters, digits and '_'. Whitespace, '/', '`' and '-' are converted to '_'.
     *
     * @param unfriendlyString the text to convert
     * @return the url friendly variant; may be empty when nothing usable remains
     * @throws IllegalArgumentException if the converted string still contains other characters
     */
    public static String urlFriendlyStrict(String unfriendlyString) throws IllegalArgumentException {
        String friendlyString = urlFriendly(unfriendlyString);
        // This check can be removed to improve performance.
        if (!URLFRIENDLY.matcher(friendlyString).matches()) {
            // The message is only built on failure.
            throw new IllegalArgumentException(
                    String.format("Friendly string [%s] based on [%s] is not friendly enough",
                            friendlyString, unfriendlyString));
        }
        return friendlyString;
    }

    /**
     * Best-effort conversion to a url friendly variant; the result is not validated.
     * Use {@link #urlFriendlyStrict(String)} to avoid potential bugs in this code.
     */
    private static String urlFriendly(String unfriendlyString) {
        String result = replaceSeparatorsWithUnderscores(unfriendlyString.trim());
        result = transLiterateSymbols(result);
        result = replaceCombiningDiacriticalMarks(result);
        result = replaceEquivalentsOfSymbols(result);
        result = removeCrappyCharacters(result);
        // Locale.ROOT keeps the mapping locale-independent: with a Turkish default locale
        // 'I' would otherwise become the non-ASCII dotless 'ı' after the ASCII filter has run.
        return result.toLowerCase(java.util.Locale.ROOT);
    }

    /** Transliterates the text to Latin script using the "Any-Latin" transliterator. */
    private static String transLiterateSymbols(String incomprehensibleString) {
        return TO_LATIN_TRANSLITERATOR.transform(incomprehensibleString);
    }

    /** Replaces letters without a Unicode decomposition by their closest ASCII letter. */
    private static String replaceEquivalentsOfSymbols(String unfriendlyString) {
        String result = EEQUIVALENTS.matcher(unfriendlyString).replaceAll("e");
        result = IEQUIVALENTS.matcher(result).replaceAll("i");
        result = DEQUIVALENTS.matcher(result).replaceAll("d");
        result = OEQUIVALENTS.matcher(result).replaceAll("o");
        return LEQUIVALENTS.matcher(result).replaceAll("l");
    }

    /** Deletes every character matched by {@code CRAP}. */
    private static String removeCrappyCharacters(String unfriendlyString) {
        return CRAP.matcher(unfriendlyString).replaceAll("");
    }

    /** Replaces every separator character by a single '_'. */
    private static String replaceSeparatorsWithUnderscores(String unfriendlyString) {
        return SEPARATORS.matcher(unfriendlyString).replaceAll("_");
    }
}
Câu hỏi này có một cách dựa trên regex để làm điều đó trong PHP: http://stackoverflow.com/questions/2580581/best-way-to-escape-and-create-a-slug