From 8c0c3ed14cb3f33cae956520902c3c6d1812e1fa Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Sun, 3 Mar 2024 20:54:53 +0000 Subject: [PATCH 1/8] Faster popcnt_i32 --- codegen/luau/runtime/runtime.lua | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/codegen/luau/runtime/runtime.lua b/codegen/luau/runtime/runtime.lua index 6ac37d3..0e99f3f 100644 --- a/codegen/luau/runtime/runtime.lua +++ b/codegen/luau/runtime/runtime.lua @@ -182,14 +182,12 @@ do local bit_countrz = bit32.countrz local function popcnt_i32(num) - local count = 0 - - while num ~= 0 do - num = bit_and(num, num - 1) - count = count + 1 - end - - return count + num = bit_and(num, 0x55555555) + bit_and(bit_rshift(num, 1), 0x55555555) + num = bit_and(num, 0x33333333) + bit_and(bit_rshift(num, 2), 0x33333333) + num = bit_and(num, 0x0f0f0f0f) + bit_and(bit_rshift(num, 4), 0x0f0f0f0f) + num = bit_and(num, 0x00ff00ff) + bit_and(bit_rshift(num, 8), 0x00ff00ff) + num = bit_and(num, 0x0000ffff) + bit_and(bit_rshift(num, 16), 0x0000ffff) + return num end popcnt.i32 = popcnt_i32 From 5cfcb2ff890efb7ebca2f05ab57ba221723d8220 Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Sun, 3 Mar 2024 20:55:04 +0000 Subject: [PATCH 2/8] Faster equality --- codegen/luau/runtime/numeric_v3.lua | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/codegen/luau/runtime/numeric_v3.lua b/codegen/luau/runtime/numeric_v3.lua index faa2ea0..ea20578 100644 --- a/codegen/luau/runtime/numeric_v3.lua +++ b/codegen/luau/runtime/numeric_v3.lua @@ -1,6 +1,8 @@ local Numeric = {} -local NUM_ZERO, NUM_ONE, NUM_SIX_FOUR +local NUM_ZERO = Vector3.zero +local NUM_ONE, NUM_SIX_FOUR + local NUM_BIT_26, NUM_BIT_52 local bit_lshift = bit32.lshift @@ -339,11 +341,11 @@ function Numeric.is_negative(value) end function Numeric.is_zero(value) - return value.X == 0 and value.Y == 0 and value.Z == 0 + return value == NUM_ZERO end function Numeric.is_equal(lhs, rhs) - return lhs.X == rhs.X and lhs.Y == rhs.Y and lhs.Z == rhs.Z + return lhs == rhs end function Numeric.is_less_unsigned(lhs, rhs) @@ -402,7 +404,6 @@ num_is_negative = Numeric.is_negative num_is_zero = Numeric.is_zero num_is_less_unsigned = Numeric.is_less_unsigned -NUM_ZERO = from_u64(0) NUM_ONE = from_u64(1) NUM_SIX_FOUR = from_u64(64) NUM_BIT_26 = from_u64(0x4000000) From b4971cc76b7c9d2fe7101edbd986d499688d4d2f Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Sun, 3 Mar 2024 21:02:54 +0000 Subject: [PATCH 3/8] Optimize 2 --- codegen/luau/src/bin/wasm2luau.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/codegen/luau/src/bin/wasm2luau.rs b/codegen/luau/src/bin/wasm2luau.rs index c3c2ce9..ebbb47a 100644 --- a/codegen/luau/src/bin/wasm2luau.rs +++ b/codegen/luau/src/bin/wasm2luau.rs @@ -20,6 +20,7 @@ fn do_runtime(lock: &mut dyn Write) -> Result<()> { let runtime = codegen_luau::RUNTIME; let numeric = codegen_luau::NUMERIC; + writeln!(lock, "--!optimize 2")?; writeln!(lock, "local Integer = (function()")?; writeln!(lock, "{numeric}")?; writeln!(lock, "end)()")?; From b7cfd3d405d94a44c04abb84a536a708d066cfc3 Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Sun, 3 Mar 2024 21:08:25 +0000 Subject: [PATCH 4/8] Take advantage of inlining --- codegen/luau/runtime/numeric_v3.lua | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/codegen/luau/runtime/numeric_v3.lua b/codegen/luau/runtime/numeric_v3.lua index ea20578..300d57c 100644 --- a/codegen/luau/runtime/numeric_v3.lua +++ b/codegen/luau/runtime/numeric_v3.lua @@ -20,7 +20,7 @@ local bit_replace = bit32.replace local from_u32, from_u64, into_u64 local num_subtract, num_divide_unsigned, num_negate local num_or, num_shift_left, num_shift_right_unsigned -local num_is_negative, num_is_zero, num_is_less_unsigned +local num_is_less_unsigned -- X: a[0 ..21] -- Y: a[22..31] @@ -36,6 +36,10 @@ function Numeric.from_u32(data_1, data_2) return constructor(x, y, z) end +local function num_is_zero(value) + return value == NUM_ZERO +end + local function load_d1(value) return bit_replace(bit_and(value.X, 0x3FFFFF), value.Z, 22, 10) end @@ -180,6 +184,10 @@ function Numeric.divide_unsigned(lhs, rhs) return quotient, remainder end +local function num_is_negative(value) + return value.Z >= 0x80000 +end + function Numeric.divide_signed(lhs, rhs) local left_negative = num_is_negative(lhs) local right_negative = num_is_negative(rhs) @@ -336,14 +344,6 @@ function Numeric.rotate_right(lhs, rhs) end end -function Numeric.is_negative(value) - return value.Z >= 0x80000 -end - -function Numeric.is_zero(value) - return value == NUM_ZERO -end - function Numeric.is_equal(lhs, rhs) return lhs == rhs end @@ -400,8 +400,8 @@ num_or = Numeric.bit_or num_shift_left = Numeric.shift_left num_shift_right_unsigned = Numeric.shift_right_unsigned -num_is_negative = Numeric.is_negative -num_is_zero = Numeric.is_zero +Numeric.is_negative = num_is_negative +Numeric.is_zero = num_is_zero num_is_less_unsigned = Numeric.is_less_unsigned NUM_ONE = from_u64(1) From 895c028ae81509272966c237458a1716c78321ca Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Mon, 4 Mar 2024 16:01:03 +0000 Subject: [PATCH 5/8] Replace load_d1 load_d2 with into_u32 --- codegen/luau/runtime/numeric_v3.lua | 38 +++++++++++++++++------------ 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/codegen/luau/runtime/numeric_v3.lua b/codegen/luau/runtime/numeric_v3.lua index 300d57c..de5a9d7 100644 --- a/codegen/luau/runtime/numeric_v3.lua +++ b/codegen/luau/runtime/numeric_v3.lua @@ -48,21 +48,25 @@ local function load_d2(value) return bit_replace(bit_and(value.Y, 0x3FFFFF), bit_rshift(value.Z, 10), 22, 10) end -function Numeric.into_u32(value) +local function into_u32(value) return load_d1(value), load_d2(value) end +Numeric.into_u32 = into_u32 function Numeric.from_u64(value) return from_u32(bit_and(value % 0x100000000), bit_and(value / 0x100000000)) end function Numeric.into_u64(value) - return load_d1(value) + load_d2(value) * 0x100000000 + local value_1, value_2 = into_u32(value) + return value_1 + value_2 * 0x100000000 end function Numeric.add(lhs, rhs) - local data_1 = load_d1(lhs) + load_d1(rhs) - local data_2 = load_d2(lhs) + load_d2(rhs) + local lhs_1, lhs_2 = into_u32(lhs) + local rhs_1, rhs_2 = into_u32(rhs) + local data_1 = lhs_1 + rhs_1 + local data_2 = lhs_2 + rhs_2 if data_1 >= 0x100000000 then data_1 = data_1 - 0x100000000 @@ -77,8 +81,10 @@ function Numeric.add(lhs, rhs) end function Numeric.subtract(lhs, rhs) - local data_1 = load_d1(lhs) - load_d1(rhs) - local data_2 = load_d2(lhs) - load_d2(rhs) + local lhs_1, lhs_2 = into_u32(lhs) + local rhs_1, rhs_2 = into_u32(rhs) + local data_1 = lhs_1 - rhs_1 + local data_2 = lhs_2 - rhs_2 if data_1 < 0 then data_1 = data_1 + 0x100000000 @@ -101,8 +107,8 @@ function Numeric.multiply(lhs, rhs) -- Divide each long into 4 chunks of 16 bits, and then add up 4x4 products. -- We can skip products that would overflow. - local lhs_1, lhs_2 = load_d1(lhs), load_d2(lhs) - local rhs_1, rhs_2 = load_d1(rhs), load_d2(rhs) + local lhs_1, lhs_2 = into_u32(lhs) + local rhs_1, rhs_2 = into_u32(rhs) local a48 = bit_rshift(lhs_2, 16) local a32 = bit_and(lhs_2, 0xFFFF) @@ -161,11 +167,10 @@ function Numeric.divide_unsigned(lhs, rhs) local quotient = NUM_ZERO local remainder = NUM_ZERO - local num_1, num_2 = load_d1(lhs), load_d2(lhs) + local num_1, num_2 = into_u32(lhs) for i = 63, 0, -1 do - local temp = num_shift_left(remainder, NUM_ONE) - local rem_1, rem_2 = load_d1(temp), load_d2(temp) + local rem_1, rem_2 = into_u32(num_shift_left(remainder, NUM_ONE)) if i > 31 then rem_1 = bit_or(rem_1, bit_extract(num_2, i - 32, 1)) @@ -214,8 +219,9 @@ function Numeric.divide_signed(lhs, rhs) end function Numeric.negate(value) - local data_1 = bit_not(load_d1(value)) + 1 - local data_2 = bit_not(load_d2(value)) + local value_1, value_2 = into_u32(value) + local data_1 = bit_not(value_1) + 1 + local data_2 = bit_not(value_2) if data_1 >= 0x100000000 then data_1 = data_1 - 0x100000000 @@ -268,7 +274,7 @@ function Numeric.shift_left(lhs, rhs) return lhs elseif count < 32 then local pad = 32 - count - local lhs_1, lhs_2 = load_d1(lhs), load_d2(lhs) + local lhs_1, lhs_2 = into_u32(lhs) local data_1 = bit_lshift(lhs_1, count) local data_2 = bit_replace(bit_rshift(lhs_1, pad), lhs_2, count, pad) @@ -287,7 +293,7 @@ function Numeric.shift_right_unsigned(lhs, rhs) if count == 0 then return lhs elseif count < 32 then - local lhs_1, lhs_2 = load_d1(lhs), load_d2(lhs) + local lhs_1, lhs_2 = into_u32(lhs) local data_1 = bit_replace(bit_rshift(lhs_1, count), lhs_2, 32 - count, count) local data_2 = bit_rshift(lhs_2, count) @@ -306,7 +312,7 @@ function Numeric.shift_right_signed(lhs, rhs) if count == 0 then return lhs elseif count < 32 then - local lhs_1, lhs_2 = load_d1(lhs), load_d2(lhs) + local lhs_1, lhs_2 = into_u32(lhs) local data_1 = bit_replace(bit_rshift(lhs_1, count), lhs_2, 32 - count, count) local data_2 = bit_arshift(lhs_2, count) From d11dc2d58292ddc3a51172aeb3e505777cf4bcd0 Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Mon, 4 Mar 2024 16:07:20 +0000 Subject: [PATCH 6/8] Cache x, y, z in into_u32 --- codegen/luau/runtime/numeric_v3.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/codegen/luau/runtime/numeric_v3.lua b/codegen/luau/runtime/numeric_v3.lua index de5a9d7..f9f93f2 100644 --- a/codegen/luau/runtime/numeric_v3.lua +++ b/codegen/luau/runtime/numeric_v3.lua @@ -49,7 +49,8 @@ local function load_d2(value) end local function into_u32(value) - return load_d1(value), load_d2(value) + local x, y, z = value.X, value.Y, value.Z + return bit_replace(bit_and(x, 0x3FFFFF), z, 22, 10), bit_replace(bit_and(y, 0x3FFFFF), bit_rshift(z, 10), 22, 10) end Numeric.into_u32 = into_u32 From 3157f427d12b4296fefcaad64322fb00a5f94d0b Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Mon, 4 Mar 2024 16:33:01 +0000 Subject: [PATCH 7/8] Even faster popcnt_i32 --- codegen/luau/runtime/runtime.lua | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/codegen/luau/runtime/runtime.lua b/codegen/luau/runtime/runtime.lua index 0e99f3f..9b8fc3c 100644 --- a/codegen/luau/runtime/runtime.lua +++ b/codegen/luau/runtime/runtime.lua @@ -182,12 +182,12 @@ do local bit_countrz = bit32.countrz local function popcnt_i32(num) - num = bit_and(num, 0x55555555) + bit_and(bit_rshift(num, 1), 0x55555555) - num = bit_and(num, 0x33333333) + bit_and(bit_rshift(num, 2), 0x33333333) - num = bit_and(num, 0x0f0f0f0f) + bit_and(bit_rshift(num, 4), 0x0f0f0f0f) - num = bit_and(num, 0x00ff00ff) + bit_and(bit_rshift(num, 8), 0x00ff00ff) - num = bit_and(num, 0x0000ffff) + bit_and(bit_rshift(num, 16), 0x0000ffff) - return num + num = num - bit_and(bit_rshift(num, 1), 0x55555555) + num = bit_and(num, 0x33333333) + bit_and(bit_rshift(num, 2), 0x33333333) + num = bit_and((num + bit_rshift(num, 4)), 0x0F0F0F0F) + num = num + bit_rshift(num, 8) + num = num + bit_rshift(num, 16) + return bit_and(num, 0x0000003F) end popcnt.i32 = popcnt_i32 From c8dcd9848eaaf1a944a63be60dd045af8aefa35f Mon Sep 17 00:00:00 2001 From: Someon1e <142684596+Someon1e@users.noreply.github.com> Date: Mon, 4 Mar 2024 17:06:22 +0000 Subject: [PATCH 8/8] Truncate with BOR instead of BAND --- codegen/luau/runtime/runtime.lua | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/codegen/luau/runtime/runtime.lua b/codegen/luau/runtime/runtime.lua index 9b8fc3c..a4fb33d 100644 --- a/codegen/luau/runtime/runtime.lua +++ b/codegen/luau/runtime/runtime.lua @@ -1,6 +1,7 @@ local module = {} local bit_and = bit32.band +local bit_or = bit32.bor local bit_xor = bit32.bxor local bit_lshift = bit32.lshift local bit_rshift = bit32.rshift @@ -44,16 +45,16 @@ do local num_divide_unsigned = Integer.divide_unsigned function add.i32(lhs, rhs) - return bit_and(lhs + rhs, 0xFFFFFFFF) + return bit_or(lhs + rhs, 0) end function sub.i32(lhs, rhs) - return bit_and(lhs - rhs, 0xFFFFFFFF) + return bit_or(lhs - rhs, 0) end function mul.i32(lhs, rhs) if (lhs + rhs) < 0x8000000 then - return bit_and(lhs * rhs, 0xFFFFFFFF) + return bit_or(lhs * rhs, 0) else local a16 = bit_rshift(lhs, 16) local a00 = bit_and(lhs, 0xFFFF) @@ -63,7 +64,7 @@ do local c00 = a00 * b00 local c16 = a16 * b00 + a00 * b16 - return bit_and(c00 + bit_lshift(c16, 16), 0xFFFFFFFF) + return bit_or(c00 + bit_lshift(c16, 16), 0) end end @@ -73,13 +74,13 @@ do lhs = to_i32(lhs) rhs = to_i32(rhs) - return bit_and(math_modf(lhs / rhs), 0xFFFFFFFF) + return bit_or(math_modf(lhs / rhs), 0) end function div.u32(lhs, rhs) assert(rhs ~= 0, "division by zero") - return bit_and(math_modf(lhs / rhs), 0xFFFFFFFF) + return bit_or(math_modf(lhs / rhs), 0) end function rem.i32(lhs, rhs) @@ -88,7 +89,7 @@ do lhs = to_i32(lhs) rhs = to_i32(rhs) - return bit_and(math_fmod(lhs, rhs), 0xFFFFFFFF) + return bit_or(math_fmod(lhs, rhs), 0) end add.i64 = Integer.add @@ -395,7 +396,7 @@ do end function truncate.i32_f32(num) - return bit_and(truncate_f64(num), 0xFFFFFFFF) + return bit_or(truncate_f64(num), 0) end truncate.i32_f64 = truncate.i32_f32 @@ -431,7 +432,7 @@ do function saturate.i32_f32(num) local temp = math_clamp(truncate_f64(num), -0x80000000, 0x7FFFFFFF) - return bit_and(temp, 0xFFFFFFFF) + return bit_or(temp, 0) end saturate.i32_f64 = saturate.i32_f32 @@ -474,7 +475,7 @@ do num = bit_and(num, 0xFF) if num >= 0x80 then - return bit_and(num - 0x100, 0xFFFFFFFF) + return bit_or(num - 0x100, 0) else return num end @@ -484,7 +485,7 @@ do num = bit_and(num, 0xFFFF) if num >= 0x8000 then - return bit_and(num - 0x10000, 0xFFFFFFFF) + return bit_or(num - 0x10000, 0) else return num end @@ -648,7 +649,7 @@ do local buffer_write_f64 = buffer.writef64 function load.i32_i8(memory, addr) - return bit_and(buffer_read_i8(memory.data, addr), 0xFFFFFFFF) + return bit_or(buffer_read_i8(memory.data, addr), 0) end function load.i32_u8(memory, addr) @@ -656,7 +657,7 @@ do end function load.i32_i16(memory, addr) - return bit_and(buffer_read_i16(memory.data, addr), 0xFFFFFFFF) + return bit_or(buffer_read_i16(memory.data, addr), 0) end function load.i32_u16(memory, addr)