Java过滤Emoji字符

微信昵称支持Emoji表情。存储微信昵称时,若线上MySQL的字符集未采用utf8mb4,仍在使用utf8(每个字符最多占3个字节,无法存储4字节的Emoji),则向数据库写数据前需要先过滤掉昵称中的Emoji表情。
过滤emoji可选用以下方法:

方法一:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/**
 * Removes characters that cannot be stored in a 3-byte MySQL {@code utf8} column.
 *
 * <p>The filter works on UTF-16 code units. It drops every surrogate code unit
 * (U+D800..U+DFFF), which removes all supplementary-plane characters such as
 * most emoji, plus control characters below U+0020 other than NUL, TAB, LF and
 * CR, and the non-characters U+FFFE and U+FFFF. Emoji that live in the Basic
 * Multilingual Plane (for example U+263A) are kept.
 */
public class EmojiFilter {

    /**
     * Reports whether the text contains at least one character that
     * {@link #filterEmoji(String)} would remove.
     *
     * @param source text to inspect; may be {@code null}
     * @return {@code true} if a removable character is present; {@code false}
     *         otherwise, including for {@code null} or empty input
     */
    public static boolean containsEmoji(String source)
    {
        if (source == null || source.isEmpty())
        {
            return false;
        }
        for (int i = 0, len = source.length(); i < len; i++)
        {
            if (isEmojiCharacter(source.charAt(i)))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a single UTF-16 code unit must be filtered out.
     *
     * @param codePoint the code unit to classify
     * @return {@code true} if the unit is NOT an allowed plain character
     */
    private static boolean isEmojiCharacter(char codePoint)
    {
        return !isPlainCharacter(codePoint);
    }

    /**
     * Allowed characters: NUL, TAB, LF, CR, U+0020..U+D7FF and U+E000..U+FFFD.
     * A {@code char} can never exceed U+FFFF, so no supplementary range is
     * tested here; supplementary characters show up as surrogate pairs, which
     * fall outside the allowed ranges.
     */
    private static boolean isPlainCharacter(char c)
    {
        return c == 0x0 || c == 0x9 || c == 0xA || c == 0xD
            || (c >= 0x20 && c <= 0xD7FF)
            || (c >= 0xE000 && c <= 0xFFFD);
    }

    /**
     * Returns the text with emoji and other non-text characters removed.
     *
     * @param source text to clean; may be {@code null}
     * @return {@code source} itself when nothing has to be removed (including
     *         {@code null} and empty input); otherwise a new string holding only
     *         the allowed characters, which is empty if every character was removed
     */
    public static String filterEmoji(String source)
    {
        if (!containsEmoji(source))
        {
            // Nothing to strip: hand back the same instance and avoid a copy.
            return source;
        }
        StringBuilder kept = new StringBuilder(source.length());
        for (int i = 0, len = source.length(); i < len; i++)
        {
            char c = source.charAt(i);
            if (!isEmojiCharacter(c))
            {
                kept.append(c);
            }
        }
        return kept.toString();
    }
}

单元测试如下:

1
2
3
4
5
6
7
8
/**
 * Checks that EmojiFilter.filterEmoji strips the emoji from a nickname and
 * leaves every other character in place.
 */
public void testEmojiFilter1() {
    String predictStr = "An awesome string with a few emojis!";
    String nickname = "An 😀awesome 😃string with a few 😉emojis!";
    String newNickname = EmojiFilter.filterEmoji(nickname);
    System.out.println("newNickname: " + newNickname);
    // Compare from the known non-null side so a null result fails the assertion instead of throwing.
    assertTrue("newNickname result: " + newNickname, predictStr.equals(newNickname));
}

方法二:

如果是maven项目,在pom中引入

1
2
3
4
5
<!-- emoji-java library; the EmojiParser calls in the snippets below are presented as coming from it -->
<dependency>
<groupId>com.vdurmont</groupId>
<artifactId>emoji-java</artifactId>
<version>3.3.0</version>
</dependency>

使用库封装的方法:

1
// Ask the library for a copy of str with the emoji removed (behavior as implied by the method name).
String resultStr = EmojiParser.removeAllEmojis(str);

简单封装:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
 * Small convenience wrapper around EmojiParser for emoji detection.
 */
public class EmojiFilterUtil {

    /**
     * Tells whether the given text contains emoji, judged by whether
     * EmojiParser.removeAllEmojis returns a string of a different length.
     *
     * @param source text to inspect; may be null
     * @return true if stripping emoji changes the length; false otherwise,
     *         including for null or empty input
     */
    public static boolean containsEmoji(String source) {
        boolean nothingToCheck = (source == null) || source.isEmpty();
        if (nothingToCheck) {
            return false;
        }
        int originalLength = source.length();
        String stripped = EmojiParser.removeAllEmojis(source);
        return stripped.length() != originalLength;
    }
}

EmojiParser提供emoji处理的各种方法,感兴趣自己试下。

方法三(推荐):

过滤掉只有utf8mb4才能存储、而utf8(每个字符最多3个字节)无法存储的4字节字符,不仅仅是emoji

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/**
 * Strips characters whose UTF-8 encoding is longer than three bytes, so the
 * remaining text fits a 3-byte MySQL {@code utf8} column. This covers emoji and
 * every other supplementary-plane character.
 */
public class FilterUtf8Mb4Util {

    /** Fully qualified so that the file's import list does not have to change. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Removes every character that needs four bytes in UTF-8.
     *
     * <p>The text is encoded to UTF-8 and walked one sequence at a time, using
     * the lead byte to determine the sequence length. Sequences of one, two or
     * three bytes are copied; four-byte sequences are skipped. Only the bytes
     * actually kept are decoded, so the result carries no padding.
     *
     * @param text the original string; may be {@code null}
     * @return the filtered string; {@code null} or empty input is returned as is
     */
    public static String filterOffUtf8Mb4(String text) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        byte[] bytes = text.getBytes(UTF_8);
        byte[] kept = new byte[bytes.length];
        int keptLength = 0;
        int i = 0;
        while (i < bytes.length) {
            int lead = bytes[i] & 0xFF; // view the signed byte as 0..255
            int sequenceLength;
            boolean keep;
            if (lead < 0x80) {                  // 0xxxxxxx: one byte, NUL included
                sequenceLength = 1;
                keep = true;
            } else if ((lead & 0xE0) == 0xC0) { // 110xxxxx: two bytes
                sequenceLength = 2;
                keep = true;
            } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx: three bytes
                sequenceLength = 3;
                keep = true;
            } else if ((lead & 0xF8) == 0xF0) { // 11110xxx: four bytes, dropped
                sequenceLength = 4;
                keep = false;
            } else {
                // Not a valid lead byte. String.getBytes never produces one;
                // skip a single byte so the scan resynchronizes.
                sequenceLength = 1;
                keep = false;
            }
            // Never read past the end, even on a truncated sequence.
            sequenceLength = Math.min(sequenceLength, bytes.length - i);
            if (keep) {
                System.arraycopy(bytes, i, kept, keptLength, sequenceLength);
                keptLength += sequenceLength;
            }
            i += sequenceLength;
        }
        return new String(kept, 0, keptLength, UTF_8);
    }
}

单元测试如下:

1
2
3
4
5
6
7
8
9
/**
 * Tests for FilterUtf8Mb4Util.filterOffUtf8Mb4.
 */
public class FilterUtf8Mb4UtilTest extends TestCase {

    /**
     * Feeds strings containing 4-byte characters through the filter and checks
     * that they are changed, that no surrogate pair survives, and that the
     * ordinary characters are still there.
     */
    public void testFilterOffUtf8Mb4() throws Exception {
        // The emoji are written as surrogate-pair escapes (U+1F600, U+1F603,
        // U+1F609) so the test does not depend on the source file's encoding.
        String illegalStr1 = "An \uD83D\uDE00awesome \uD83D\uDE03string with a few \uD83D\uDE09emojis!";
        // U+1F3FB (a skin-tone modifier) followed by two 3-byte characters.
        String illegalStr2 = "\uD83C\uDFFB\u60C5\u30B7";

        String filtered1 = FilterUtf8Mb4Util.filterOffUtf8Mb4(illegalStr1);
        String filtered2 = FilterUtf8Mb4Util.filterOffUtf8Mb4(illegalStr2);

        assertTrue(!filtered1.equals(illegalStr1));
        assertTrue(!filtered2.equals(illegalStr2));
        // No high surrogate may remain once the 4-byte characters are gone.
        assertEquals(-1, filtered1.indexOf('\uD83D'));
        assertEquals(-1, filtered2.indexOf('\uD83C'));
        // The characters that fit in three bytes must be preserved.
        assertTrue(filtered1.startsWith("An awesome string with a few emojis!"));
        assertTrue(filtered2.startsWith("\u60C5\u30B7"));
    }
}