1
常见字符串编码
●
2
编码转换性能
●
static int encodeUTF8(char[] utf16, int off, int len, byte[] dest, int dp) { int sl = off + len, last_offset = sl - 1;
while (off < sl) { char c = utf16[off++]; if (c < 0x80) { // Have at most seven bits dest[dp++] = (byte) c; } else if (c < 0x800) { // 2 dest, 11 bits dest[dp++] = (byte) (0xc0 | (c >> 6)); dest[dp++] = (byte) (0x80 | (c & 0x3f)); } else if (c >= '\uD800' && c < '\uE000') { int uc; if (c < '\uDC00') { if (off > last_offset) { dest[dp++] = (byte) '?'; return dp; }
char d = utf16[off]; if (d >= '\uDC00' && d < '\uE000') { uc = (c << 10) + d + 0xfca02400; } else { throw new RuntimeException("encodeUTF8 error", new MalformedInputException(1)); } } else { uc = c; } dest[dp++] = (byte) (0xf0 | ((uc >> 18))); dest[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f)); dest[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f)); dest[dp++] = (byte) (0x80 | (uc & 0x3f)); off++; // 2 utf16 } else { // 3 dest, 16 bits dest[dp++] = (byte) (0xe0 | ((c >> 12))); dest[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f)); dest[dp++] = (byte) (0x80 | (c & 0x3f)); } } return dp;}
/**
 * Copies {@code len} chars of {@code chars}, starting at {@code off}, into {@code dest}
 * at byte index {@code dp} with a single raw memory copy (2 bytes per char).
 *
 * <p>No bounds checks are performed here; the caller must guarantee that both ranges are
 * inside their arrays.
 * NOTE(review): a raw copy keeps the platform's native char byte order, so the result is
 * presumably UTF-16LE only on little-endian hardware — confirm.
 *
 * @param chars source chars
 * @param off   index of the first char to copy
 * @param len   number of chars to copy
 * @param dest  destination buffer
 * @param dp    byte index in {@code dest} where the copy starts
 * @return the byte index in {@code dest} just past the copied data
 */
static int writeUtf16LE(char[] chars, int off, int len, byte[] dest, final int dp) {
    UNSAFE.copyMemory(
            chars,
            CHAR_ARRAY_BASE_OFFSET + ((long) off << 1), // widened so the byte offset cannot overflow int
            dest,
            BYTE_ARRAY_BASE_OFFSET + dp,
            (long) len << 1);
    // dp is final, so the new position is returned instead of being assigned back to it.
    return dp + (len << 1);
}
3
Java String的编码
●
// Sketch of a String layout made of a char[] plus separate offset and count fields.
// NOTE(review): presumably the layout used by early JDKs, where a String could view a slice of a larger array — confirm.
static class String { final char[] value; final int offset; final int count;}
// Sketch of a String layout that holds only a char[]; the JDK 8 String quoted later in this article has the same shape.
static class String { final char[] value;}
// Sketch of a String layout that stores its content in a byte[] together with a one-byte encoding flag.
// NOTE(review): the JDK charAt snippet later in this article names the flag field `coder`, not `code` — confirm the intended name.
static class String { final byte code; final byte[] value;
// The two values the encoding flag can take.
static final byte LATIN1 = 0; static final byte UTF16 = 1;}
4
快速构造字符串的方法
●
比如如下是JDK8的String的一个构造函数的实现
// JDK 8 public constructor as quoted by the article: it stores a copy made with Arrays.copyOf, not the caller's array.
public final class String { public String(char value[]) { this.value = Arrays.copyOf(value, value.length); }}
在JDK8中,有一个构造函数是不做拷贝的,但这个方法不是public,需要用一个技巧实现MethodHandles.Lookup & LambdaMetafactory绑定反射来调用,文章后面有介绍这个技巧的代码。
public final class String { String(char[] value, boolean share) { // assert share : "unshared not supported"; this.value = value; }}
Benchmark                           Mode  Cnt       Score       Error   Units
StringCreateBenchmark.invoke       thrpt    5  784869.350 ±  1936.754  ops/ms
StringCreateBenchmark.langAccess   thrpt    5  784029.186 ±  2734.300  ops/ms
StringCreateBenchmark.unsafe       thrpt    5  761176.319 ± 11914.549  ops/ms
StringCreateBenchmark.newString    thrpt    5  140883.533 ±  2217.773  ops/ms
public static BiFunction<char[], Boolean, String> getStringCreatorJDK8() throws Throwable {Constructor<MethodHandles.Lookup> constructor = MethodHandles.Lookup.class.getDeclaredConstructor(Class.class, int.class);constructor.setAccessible(true);MethodHandles lookup = constructor.newInstance(String.class, -1 // Lookup.TRUSTED);MethodHandles.Lookup caller = lookup.in(String.class);MethodHandle handle = caller.findConstructor(String.class, MethodType.methodType(void.class, char[].class, boolean.class));CallSite callSite = LambdaMetafactory.metafactory(caller, "apply", MethodType.methodType(BiFunction.class), handle.type().generic(), handle, handle.type());return (BiFunction) callSite.getTarget().invokeExact();}
4.1.2 JDK 11快速构造字符串的方法
public static ToIntFunction<String> getStringCode11() throws Throwable { Constructor<MethodHandles.Lookup> constructor = MethodHandles.Lookup.class.getDeclaredConstructor(Class.class, int.class); constructor.setAccessible(true); MethodHandles.Lookup lookup = constructor.newInstance( String.class , -1 // Lookup.TRUSTED );
MethodHandles.Lookup caller = lookup.in(String.class); MethodHandle handle = caller.findVirtual( String.class, "coder", MethodType.methodType(byte.class) );
CallSite callSite = LambdaMetafactory.metafactory( caller , "applyAsInt" , MethodType.methodType(ToIntFunction.class) , MethodType.methodType(int.class, Object.class) , handle , handle.type() );
return (ToIntFunction<String>) callSite.getTarget().invokeExact();}
// Usage check that only runs when JDKUtils.JVM_VERSION is 11: fetch the creator function from JDKUtils.
// NOTE(review): getStringCreatorJDK11() itself is not shown in this article — confirm its behavior against JDKUtils.
if (JDKUtils.JVM_VERSION == 11) { Function<byte[], String> stringCreator = JDKUtils.getStringCreatorJDK11();
// Three ASCII bytes go in; the resulting String is expected to equal "abc".
byte[] bytes = new byte[]{'a', 'b', 'c'}; String apply = stringCreator.apply(bytes); assertEquals("abc", apply);}
4.1.3 JDK 17快速构造字符串的方法
--add-opens java.base/java.lang.invoke=ALL-UNNAMED
public static BiFunction<byte[], Charset, String> getStringCreatorJDK17() throws Throwable {Constructor<MethodHandles.Lookup> constructor = MethodHandles.Lookup.class.getDeclaredConstructor(Class.class, Class.class, int.class);constructor.setAccessible(true);MethodHandles.Lookup lookup = constructor.newInstance(String.class, null, -1 // Lookup.TRUSTED);MethodHandles.Lookup caller = lookup.in(String.class);MethodHandle handle = caller.findStatic(String.class, "newStringNoRepl1", MethodType.methodType(String.class, byte[].class, Charset.class));CallSite callSite = LambdaMetafactory.metafactory(caller, "apply", MethodType.methodType(BiFunction.class), handle.type().generic(), handle, handle.type());return (BiFunction<byte[], Charset, String>) callSite.getTarget().invokeExact();}
// Usage check that only runs when JDKUtils.JVM_VERSION is 17: fetch the creator function from JDKUtils.
if (JDKUtils.JVM_VERSION == 17) { BiFunction<byte[], Charset, String> stringCreator = JDKUtils.getStringCreatorJDK17();
// Three ASCII bytes plus the US_ASCII charset go in; the resulting String is expected to equal "abc".
byte[] bytes = new byte[]{'a', 'b', 'c'}; String apply = stringCreator.apply(bytes, StandardCharsets.US_ASCII); assertEquals("abc", apply);}
4.2 基于JavaLangAccess快速构造
// Obtain the JDK-internal JavaLangAccess bridge from SharedSecrets.
JavaLangAccess javaLangAccess = SharedSecrets.getJavaLangAccess();
// NOTE(review): in the original text this statement was truncated to "StandardCharsets.US_ASCII);".
// It is reconstructed here as a newStringNoRepl(bytes, charset) call, where `bytes` is the
// caller's byte[] — confirm the method name and argument against the original article.
javaLangAccess.newStringNoRepl(bytes, StandardCharsets.US_ASCII);
4.3 基于Unsafe实现快速构造字符串
public static final Unsafe UNSAFE;static { Unsafe unsafe = null; try { Field theUnsafeField = Unsafe.class.getDeclaredField("theUnsafe"); theUnsafeField.setAccessible(true); unsafe = (Unsafe) theUnsafeField.get(null); } catch (Throwable ignored) {} UNSAFE = unsafe;}
////////////////////////////////////////////
// Allocate a String instance directly through Unsafe, then store the caller's char[] into the
// field at valueOffset, so the array is used as-is rather than copied.
// NOTE(review): valueOffset is presumably the offset of String.value on a char[]-backed String (JDK 8) — confirm.
Object str = UNSAFE.allocateInstance(String.class);UNSAFE.putObject(str, valueOffset, chars);
// Variant for a String that has a coder byte and a byte[] value: allocate the instance through
// Unsafe, write 0 into the field at coderOffset (0 is LATIN1 in the layout sketch earlier in this
// article), then store the caller's byte[] into the field at valueOffset without copying it.
// NOTE(review): coderOffset / valueOffset are presumably the offsets of String.coder / String.value — confirm.
Object str = UNSAFE.allocateInstance(String.class);UNSAFE.putByte(str, coderOffset, (byte) 0);UNSAFE.putObject(str, valueOffset, (byte[]) bytes);
如下的方法格式化日期为字符串,性能就会非常好。
/**
 * Formats the calendar's year, month and day-of-month as an 8-character {@code yyyyMMdd}
 * string, computing each digit arithmetically and handing the finished array to the
 * version-specific String creator from {@code JDKUtils} when one applies.
 *
 * <p>NOTE(review): the creator is fetched from {@code JDKUtils} on every call. The JDK 8
 * and JDK 17 getters shown in this article link a new lambda each time they are invoked,
 * so callers on a hot path presumably want the creator cached in a static field — confirm.
 *
 * @param calendar the date to format
 * @return the formatted date, e.g. {@code "20220131"}
 * @throws IllegalArgumentException if the year does not fit in four digits (0..9999);
 *                                  the digit arithmetic below would otherwise produce
 *                                  non-digit characters
 * @throws Throwable if obtaining the version-specific String creator fails
 */
public String formatYYYYMMDD(Calendar calendar) throws Throwable {
    int year = calendar.get(Calendar.YEAR);
    if (year < 0 || year > 9999) {
        throw new IllegalArgumentException("year out of range for yyyyMMdd: " + year);
    }
    int month = calendar.get(Calendar.MONTH) + 1; // Calendar.MONTH is zero-based
    int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH);

    byte y0 = (byte) (year / 1000 + '0');
    byte y1 = (byte) ((year / 100) % 10 + '0');
    byte y2 = (byte) ((year / 10) % 10 + '0');
    byte y3 = (byte) (year % 10 + '0');
    byte m0 = (byte) (month / 10 + '0');
    byte m1 = (byte) (month % 10 + '0');
    byte d0 = (byte) (dayOfMonth / 10 + '0');
    byte d1 = (byte) (dayOfMonth % 10 + '0');

    if (JDKUtils.JVM_VERSION >= 9) {
        // JDK 9+: build the String from a byte[].
        byte[] bytes = new byte[] {y0, y1, y2, y3, m0, m1, d0, d1};

        if (JDKUtils.JVM_VERSION == 17) {
            return JDKUtils.getStringCreatorJDK17().apply(bytes, StandardCharsets.US_ASCII);
        }
        if (JDKUtils.JVM_VERSION <= 11) {
            return JDKUtils.getStringCreatorJDK11().apply(bytes);
        }
        // Any other version: fall back to the public constructor.
        return new String(bytes, StandardCharsets.US_ASCII);
    }

    // Before JDK 9: build the String from a char[].
    char[] chars = new char[] {
            (char) y0, (char) y1, (char) y2, (char) y3,
            (char) m0, (char) m1,
            (char) d0, (char) d1
    };

    if (JDKUtils.JVM_VERSION == 8) {
        return JDKUtils.getStringCreatorJDK8().apply(chars, true);
    }
    return new String(chars);
}
5
快速遍历字符串的办法
●
// String.charAt over a char[] backing array, as quoted by the article: the index is checked
// against value.length (StringIndexOutOfBoundsException on failure) and the char is read directly.
public final class String { private final char value[]; public char charAt(int index) { if ((index < 0) || (index >= value.length)) { throw new StringIndexOutOfBoundsException(index); } return value[index]; }}
// String.charAt over a byte[] backing array with a coder field, as quoted by the article:
// the call is dispatched on isLatin1() to StringLatin1.charAt or StringUTF16.charAt.
public final class String { private final byte[] value; private final byte coder; public char charAt(int index) { if (isLatin1()) { return StringLatin1.charAt(value, index); } else { return StringUTF16.charAt(value, index); } }}
5.1 获取String.value的方法
Benchmark                          Mode  Cnt        Score       Error   Units
StringGetValueBenchmark.reflect   thrpt    5   438374.685 ±  1032.028  ops/ms
StringGetValueBenchmark.unsafe    thrpt    5  1302654.150 ± 59169.706  ops/ms
/** Reflective handle on {@code String.value}; stays {@code null} if the field does not exist. */
static Field valueField;

static {
    Field located = null;
    try {
        located = String.class.getDeclaredField("value");
        // Needed so the private field can be read through reflection later.
        located.setAccessible(true);
    } catch (NoSuchFieldException ignored) {
        // No such field on this runtime: valueField is left as null.
    }
    valueField = located;
}
////////////////////////////////////////////
// Read the String's backing array through the reflective valueField handle; the (char[]) cast
// only succeeds when String.value is a char[].
char[] chars = (char[]) valueField.get(str);
5.1.2 使用Unsafe获取String.value
/** Unsafe field offset of {@code String.value}; stays 0 if the field does not exist. */
static long valueFieldOffset;

static {
    try {
        valueFieldOffset = UNSAFE.objectFieldOffset(String.class.getDeclaredField("value"));
    } catch (NoSuchFieldException ignored) {
        // No such field on this runtime: the offset keeps its default value.
    }
}
////////////////////////////////////////////
// Read the String's backing array through Unsafe at the precomputed field offset; the (char[])
// cast only succeeds when String.value is a char[].
char[] chars = (char[]) UNSAFE.getObject(str, valueFieldOffset);
/** Unsafe field offset of {@code String.value}; stays 0 if the field does not exist. */
static long valueFieldOffset;

/** Unsafe field offset of {@code String.coder}; stays 0 if either lookup below fails. */
static long coderFieldOffset;

static {
    try {
        // Order matters: the value offset is stored before the coder lookup can fail.
        valueFieldOffset = UNSAFE.objectFieldOffset(String.class.getDeclaredField("value"));
        coderFieldOffset = UNSAFE.objectFieldOffset(String.class.getDeclaredField("coder"));
    } catch (NoSuchFieldException ignored) {
        // A missing field leaves the remaining offset(s) at their default value.
    }
}
////////////////////////////////////////////
// coder is a primitive byte field, so it has to be read with getByte; getObject is only
// for reference fields and its Object result cannot be assigned to a byte.
byte coder = UNSAFE.getByte(str, coderFieldOffset);
// The backing array is a reference field, read with getObject and cast to its real type.
byte[] bytes = (byte[]) UNSAFE.getObject(str, valueFieldOffset);
6
更快的encodeUTF8方法
●
public static int encodeUTF8(char[] src, int offset, int len, byte[] dst, int dp) { int sl = offset + len; int dlASCII = dp + Math.min(len, dst.length);
// ASCII only optimized loop while (dp < dlASCII && src[offset] < '\u0080') { dst[dp++] = (byte) src[offset++]; }
while (offset < sl) { char c = src[offset++]; if (c < 0x80) { // Have at most seven bits dst[dp++] = (byte) c; } else if (c < 0x800) { // 2 bytes, 11 bits dst[dp++] = (byte) (0xc0 | (c >> 6)); dst[dp++] = (byte) (0x80 | (c & 0x3f)); } else if (c >= '\uD800' && c < ('\uDFFF' + 1)) { //Character.isSurrogate(c) but 1.7 final int uc; int ip = offset - 1; if (c >= '\uD800' && c < ('\uDBFF' + 1)) { // Character.isHighSurrogate(c) if (sl - ip < 2) { uc = -1; } else { char d = src[ip + 1]; // d >= '\uDC00' && d < ('\uDFFF' + 1) if (d >= '\uDC00' && d < ('\uDFFF' + 1)) { // Character.isLowSurrogate(d) uc = ((c << 10) + d) + (0x010000 - ('\uD800' << 10) - '\uDC00'); // Character.toCodePoint(c, d) } else { dst[dp++] = (byte) '?'; continue; } } } else { // if (c >= '\uDC00' && c < ('\uDFFF' + 1)) { // Character.isLowSurrogate(c) dst[dp++] = (byte) '?'; continue; } else { uc = c; } }
if (uc < 0) { dst[dp++] = (byte) '?'; } else { dst[dp++] = (byte) (0xf0 | ((uc >> 18))); dst[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f)); dst[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f)); dst[dp++] = (byte) (0x80 | (uc & 0x3f)); offset++; // 2 chars } } else { // 3 bytes, 16 bits dst[dp++] = (byte) (0xe0 | ((c >> 12))); dst[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f)); dst[dp++] = (byte) (0x80 | (c & 0x3f)); } } return dp;}
char[] chars = UNSAFE.getObject(str, valueFieldOffset);// ensureCapacity(chars.length * 3)byte[] bytes = ...; // int bytesLength = IOUtils.encodeUTF8(chars, 0, chars.length, bytes, bytesOffset);
// JMH benchmark comparing IOUtils.encodeUTF8 over the String's internal array with String.getBytes(UTF_8).
// NOTE(review): no @Benchmark annotations are visible on the two measured methods — confirm they were not lost when the snippet was extracted.
public class EncodeUTF8Benchmark { static String STR = "01234567890ABCDEFGHIJKLMNOPQRSTUVWZYZabcdefghijklmnopqrstuvwzyz一二三四五六七八九十"; static byte[] out;
// Unsafe offset of String.value, filled in by the static initializer below.
static long valueFieldOffset;
// Sizes `out` at 3 bytes per char of STR and resolves the String.value offset; a missing field is only printed.
static { out = new byte[STR.length() * 3]; try { Field valueField = String.class.getDeclaredField("value"); valueFieldOffset = UnsafeUtils.UNSAFE.objectFieldOffset(valueField); } catch (NoSuchFieldException e) { e.printStackTrace(); } }
// Measured path 1: read STR's backing array through Unsafe and encode it straight into `out`.
// NOTE(review): the (char[]) cast presumably requires a JDK whose String.value is a char[] — confirm.
public void unsafeEncodeUTF8() throws Exception { char[] chars = (char[]) UnsafeUtils.UNSAFE.getObject(STR, valueFieldOffset); int len = IOUtils.encodeUTF8(chars, 0, chars.length, out, 0); }
// Measured path 2 (baseline): String.getBytes(UTF_8) followed by a copy into `out`.
public void getBytesUTF8() throws Exception { byte[] bytes = STR.getBytes(StandardCharsets.UTF_8); System.arraycopy(bytes, 0, out, 0, bytes.length); }
// Runs this class through the JMH Runner: throughput mode, millisecond time unit, one fork.
public static void main(String[] args) throws RunnerException { Options options = new OptionsBuilder() .include(EncodeUTF8Benchmark.class.getName()) .mode(Mode.Throughput) .timeUnit(TimeUnit.MILLISECONDS) .forks(1) .build(); new Runner(options).run(); }}
EncodeUTF8Benchmark.getBytesUTF8       thrpt    5  20690.960 ± 5431.442  ops/ms
EncodeUTF8Benchmark.unsafeEncodeUTF8   thrpt    5  34508.606 ±   55.510  ops/ms
public static int encodeUTF8(byte[] src, int offset, int len, byte[] dst, int dp) { int sl = offset + len; while (offset < sl) { byte b0 = src[offset++]; byte b1 = src[offset++];
if (b1 == 0 && b0 >= 0) { dst[dp++] = b0; } else { char c = (char)(((b0 & 0xff) << 0) | ((b1 & 0xff) << 8)); if (c < 0x800) { // 2 bytes, 11 bits dst[dp++] = (byte) (0xc0 | (c >> 6)); dst[dp++] = (byte) (0x80 | (c & 0x3f)); } else if (c >= '\uD800' && c < ('\uDFFF' + 1)) { //Character.isSurrogate(c) but 1.7 final int uc; int ip = offset - 1; if (c >= '\uD800' && c < ('\uDBFF' + 1)) { // Character.isHighSurrogate(c) if (sl - ip < 2) { uc = -1; } else { b0 = src[ip + 1]; b1 = src[ip + 2]; char d = (char) (((b0 & 0xff) << 0) | ((b1 & 0xff) << 8)); // d >= '\uDC00' && d < ('\uDFFF' + 1) if (d >= '\uDC00' && d < ('\uDFFF' + 1)) { // Character.isLowSurrogate(d) uc = ((c << 10) + d) + (0x010000 - ('\uD800' << 10) - '\uDC00'); // Character.toCodePoint(c, d) } else { return -1; } } } else { // if (c >= '\uDC00' && c < ('\uDFFF' + 1)) { // Character.isLowSurrogate(c) return -1; } else { uc = c; } }
if (uc < 0) { dst[dp++] = (byte) '?'; } else { dst[dp++] = (byte) (0xf0 | ((uc >> 18))); dst[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f)); dst[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f)); dst[dp++] = (byte) (0x80 | (uc & 0x3f)); offset++; // 2 chars } } else { // 3 bytes, 16 bits dst[dp++] = (byte) (0xe0 | ((c >> 12))); dst[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f)); dst[dp++] = (byte) (0x80 | (c & 0x3f)); } } } return dp;}
// coder is a primitive byte field, so it is read with getByte rather than getObject.
byte coder = UNSAFE.getByte(str, coderFieldOffset);
// The backing array lives at valueFieldOffset (not coderFieldOffset) and needs a cast from Object.
byte[] value = (byte[]) UNSAFE.getObject(str, valueFieldOffset);

if (coder == 0) {
    // ascii arraycopy
    // NOTE(review): coder 0 is LATIN1 in the layout sketch earlier in this article; a plain copy is
    // presumably valid UTF-8 only when every byte is below 0x80 — confirm the caller guarantees that.
} else {
    // ensureCapacity(value.length / 2 * 3): two source bytes per char, at most 3 UTF-8 bytes per char
    byte[] bytes = ...; // destination buffer (placeholder left by the article)
    int bytesLength = IOUtils.encodeUTF8(value, 0, value.length, bytes, bytesOffset);
}
7
重要提醒
●
上面这些技巧都不是给新手使用的,使用不当会容易导致BUG,如果没彻底搞懂,请不要使用!
参考链接:
[1]https://github.com/alibaba/fastjson2/blob/2.0.3/core/src/main/java/com/alibaba/fastjson2/util/IOUtils.java
[2]https://github.com/alibaba/fastjson2/blob/2.0.3/core/src/main/java/com/alibaba/fastjson2/util/JDKUtils.java
大数据学习-数学基础及应用
本课程主要介绍大数据中的数学基础:
一、向量、矩阵介绍 ;
二、向量在游戏引擎中的应用;
三、矩阵奇异值分解及其应用
四、导数、梯度介绍 ;五、最优化方法及其应用。
点击阅读原文查看详情!