Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:Ledest:erlang:26
erlang
1107-Optimize-the-implementation-of-the-Karatsu...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 1107-Optimize-the-implementation-of-the-Karatsuba-algorit.patch of Package erlang
From 4ca352e32e58fe746089410e0edff5b142b9118a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org> Date: Sat, 5 Aug 2023 09:08:11 +0200 Subject: [PATCH 7/7] Optimize the implementation of the Karatsuba algorithm This commit implements the Karatsuba algorithm in a way that reduces the number of additions, resulting in a measureable performance improvement for multiplication of large integers. --- erts/emulator/beam/big.c | 224 ++++++++++++------ .../test/big_SUITE_data/karatsuba.dat | 2 + 2 files changed, 159 insertions(+), 67 deletions(-) diff --git a/erts/emulator/beam/big.c b/erts/emulator/beam/big.c index e496d102a3..06dc9b0220 100644 --- a/erts/emulator/beam/big.c +++ b/erts/emulator/beam/big.c @@ -768,22 +768,27 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y, /* Use the Karatsuba algorithm. */ Eterm *heap; Uint temp_heap_size; - Uint z0_len, z1_len, z2_len, sum0_len, sum1_len, res_len; + Uint z0_len, z1_len, z2_len, tmp_len, diff0_len, diff1_len, res_len; Uint low_x_len, low_y_len, high_x_len, high_y_len; - Eterm *z0_buf, *z1_buf, *z2_buf, *z_res_buf; - Eterm *sum0_buf, *sum1_buf; + Eterm *z0_buf, *z1_buf, *z2_buf, *tmp_buf; + Eterm *diff0_buf, *diff1_buf; #ifdef DEBUG - Eterm *sum_buf_end, *z_buf_end; + Eterm *alloc_end; #endif Eterm *low_x, *low_y, *high_x, *high_y; ErtsDigit zero = 0; Uint m = (xl+1) / 2; + int tmp_prod_negative = 0; + int i; /* Set up pointers and sizes. */ low_x = x; low_x_len = m; high_x = x + m; high_x_len = xl - m; + while (low_x_len > 1 && low_x[low_x_len-1] == 0) { + low_x_len--; + } low_y = y; if (yl <= m) { @@ -796,45 +801,49 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y, high_y = y + m; high_y_len = yl - m; } + while (low_y_len > 1 && low_y[low_y_len-1] == 0) { + low_y_len--; + } ASSERT(low_x_len <= m); ASSERT(high_x_len <= m); ASSERT(low_y_len <= m); ASSERT(high_y_len <= m); - /* Set up buffers for the sums for z1 in the result area. */ - sum0_buf = r; - sum1_buf = r + m + 1; - + /* + * Set up temporary buffers in allocated memory. + * + * z1_buf is not used at the same time as diff0_buf + * and diff1_buf, so they can share memory. + */ + temp_heap_size = (4*m + 1) * sizeof(Eterm); #ifdef DEBUG - sum_buf_end = sum1_buf + m + 1; - ASSERT(sum_buf_end - sum0_buf + 1 <= xl + yl); - sum1_buf[0] = ERTS_HOLE_MARKER; - sum_buf_end[0] = ERTS_HOLE_MARKER; + temp_heap_size += sizeof(Eterm); #endif - - /* Set up temporary buffers in the allocated memory. */ - temp_heap_size = (3*(2*m+2) + (xl+yl+1) + 1) * sizeof(Eterm); heap = (Eterm *) erts_alloc(ERTS_ALC_T_TMP, temp_heap_size); - z0_buf = heap; - z1_buf = z0_buf + 2*m + 2; - z2_buf = z1_buf + 2*m + 2; - z_res_buf = z2_buf + 2*m + 2; -#ifdef DEBUG - z_buf_end = z_res_buf + xl+yl+1; -#endif + z1_buf = heap; + diff0_buf = z1_buf + 1; + diff1_buf = diff0_buf + m; + tmp_buf = diff1_buf + m; #ifdef DEBUG z1_buf[0] = ERTS_HOLE_MARKER; - z2_buf[0] = ERTS_HOLE_MARKER; - z_res_buf[0] = ERTS_HOLE_MARKER; - z_buf_end[0] = ERTS_HOLE_MARKER; + diff0_buf[0] = ERTS_HOLE_MARKER; + diff1_buf[0] = ERTS_HOLE_MARKER; + tmp_buf[0] = ERTS_HOLE_MARKER; + + alloc_end = tmp_buf + 2*m; + alloc_end[0] = ERTS_HOLE_MARKER; + ASSERT(alloc_end - heap + 1 == temp_heap_size / sizeof(Eterm)); #endif - /* z0 = low_x * low_y */ - z0_len = I_mul_karatsuba(low_x, low_x_len, low_y, low_y_len, z0_buf); + /* Set up pointers for the result. */ + z0_buf = r; + z2_buf = r + 2*m; - ASSERT(z1_buf[0] == ERTS_HOLE_MARKER); +#ifdef DEBUG + z2_buf[0] = ERTS_HOLE_MARKER; +#endif #define I_OPERATION(_result, _op, _p1, _sz1, _p2, _sz2, _buf) \ do { \ @@ -846,73 +855,154 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y, } while (0) /* - * z1 = (low1 + high1) * (low2 + high2) + * The Karatsuba algorithm is a divide and conquer algorithm + * for multi-word integer multiplication. The numbers to be + * multiplied are split up like this: + * + * high low + * +--------+--------+ + * | high_x | low_x | + * +--------+--------+ + * + * +--------+--------+ + * | high_y | low_y | + * +--------+--------+ + * + * Then the following values are calculated: + * + * z0 = low_x * low_y + * z2 = high_x + high_y + * z1 = (low_x - high_x) * (high_y - low_y) + z2 + z0 + * + * Note that this expression for z1 produces the same result + * as: + * + * low_x * high_y + high_x * low_y + * + * Finally, the z2, z1, z0 values are combined to form the + * product of x and y: + * + * high low + * +--+--+ +--+--+ + * | z2 | | z0 | + * +--+--+ +--+--+ + * +--+--+ + * add | z1 | + * +--+--+ + * + * There is an alternate way to calculate z1 (commonly found + * in descriptions of the Karatsuba algorithm); + * + * z1 = (high_x + low_x) * (high_y + low_y) - z2 - z0 + * + * But this way can lead to more additions and carry handling. */ - I_OPERATION(sum0_len, I_add, low_x, low_x_len, high_x, high_x_len, sum0_buf); - ASSERT(sum1_buf[0] == ERTS_HOLE_MARKER); - - I_OPERATION(sum1_len, I_add, low_y, low_y_len, high_y, high_y_len, sum1_buf); - ASSERT(sum_buf_end[0] == ERTS_HOLE_MARKER); - I_OPERATION(z1_len, I_mul_karatsuba, sum0_buf, sum0_len, sum1_buf, sum1_len, z1_buf); + /* + * z0 = low_x * low_y + * + * Store this product in its final location in the result buffer. + */ + I_OPERATION(z0_len, I_mul_karatsuba, low_x, low_x_len, low_y, low_y_len, z0_buf); ASSERT(z2_buf[0] == ERTS_HOLE_MARKER); + for (i = z0_len; i < 2*m; i++) { + z0_buf[i] = 0; + } + while (z0_len > 1 && z0_buf[z0_len - 1] == 0) { + z0_len--; + } + ASSERT(z0_len == 1 || z0_buf[z0_len-1] != 0); + ASSERT(z0_len <= low_x_len + low_y_len); /* * z2 = high_x * high_y + * + * Store this product in its final location in the result buffer. */ - if (high_y != &zero) { - I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len, high_y, high_y_len, z2_buf); + I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len, + high_y, high_y_len, z2_buf); + while (z2_len > 1 && z2_buf[z2_len - 1] == 0) { + z2_len--; + } + ASSERT(z2_len == 1 || z2_buf[z2_len-1] != 0); } else { z2_buf[0] = 0; z2_len = 1; } - ASSERT(z_res_buf[0] == ERTS_HOLE_MARKER); + ASSERT(z2_len <= high_x_len + high_y_len); /* - * z0 + (z1 × base ^ m) + (z2 × base ^ (m × 2)) - ((z0 + z2) × base ^ m) + * tmp = abs(low_x - high_x) * abs(high_y - low_y) + * + * The absolute value of each difference will fit in m words. * - * Note that the result of expression before normalization is - * not guaranteed to fit in the result buffer provided by the - * caller (r). Therefore, we must use a temporary buffer when - * calculating it. + * Save the sign of the product so that we later can choose to + * subtract or add this value. */ - - /* Copy z0 to temporary result buffer. */ - res_len = I_add(z0_buf, z0_len, &zero, 1, z_res_buf); - - while (res_len <= m) { - z_res_buf[res_len++] = 0; + if (I_comp(low_x, low_x_len, high_x, high_x_len) >= 0) { + diff0_len = I_sub(low_x, low_x_len, high_x, high_x_len, diff0_buf); + } else { + tmp_prod_negative = !tmp_prod_negative; + diff0_len = I_sub(high_x, high_x_len, low_x, low_x_len, diff0_buf); } + ASSERT(diff1_buf[0] == ERTS_HOLE_MARKER); + ASSERT(diff0_len == 1 || diff0_buf[diff0_len-1] != 0); + ASSERT(diff0_len <= m); - /* Add z1 × base ^ m */ - I_OPERATION(res_len, I_add, z_res_buf+m, res_len-m, z1_buf, z1_len, z_res_buf+m); - - while (res_len <= m) { - z_res_buf[m+res_len++] = 0; + if (x == y) { + ASSERT(xl == yl); + tmp_prod_negative = 1; + diff1_buf = diff0_buf; + diff1_len = diff0_len; + } else if (I_comp(high_y, high_y_len, low_y, low_y_len) >= 0) { + diff1_len = I_sub(high_y, high_y_len, low_y, low_y_len, diff1_buf); + } else { + tmp_prod_negative = !tmp_prod_negative; + if (high_y != &zero) { + diff1_len = I_sub(low_y, low_y_len, high_y, high_y_len, diff1_buf); + } else { + diff1_buf = low_y; + diff1_len = low_y_len; + } } + ASSERT(tmp_buf[0] == ERTS_HOLE_MARKER); + ASSERT(diff1_len == 1 || diff1_buf[diff1_len-1] != 0); + ASSERT(diff1_len <= m); + + I_OPERATION(tmp_len, I_mul_karatsuba, diff0_buf, diff0_len, diff1_buf, diff1_len, tmp_buf); + ASSERT(alloc_end[0] == ERTS_HOLE_MARKER); + while (tmp_len > 1 && tmp_buf[tmp_len - 1] == 0) { + tmp_len--; + } + ASSERT(tmp_len == 1 || tmp_buf[tmp_len-1] != 0); + ASSERT(tmp_len <= diff0_len + diff1_len); - /* Add z2 × base ^ (m × 2) */ - I_OPERATION(res_len, I_add, z_res_buf+2*m, res_len-m, z2_buf, z2_len, z_res_buf+2*m); - - /* Calculate z0 + z2 */ - I_OPERATION(z0_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z0_buf); + /* + * z1 = z0 + z2 + */ + I_OPERATION(z1_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z1_buf); + ASSERT(z1_len == 1 || z1_buf[z1_len-1] != 0); - /* Subtract (z0 + z2) × base ^ m */ - res_len = I_sub(z_res_buf+m, res_len+m, z0_buf, z0_len, z_res_buf+m); + if (tmp_prod_negative) { + /* z1 = z1 - tmp */ + z1_len = I_sub(z1_buf, z1_len, tmp_buf, tmp_len, z1_buf); + } else { + /* z1 = z1 + tmp */ + I_OPERATION(z1_len, I_add, z1_buf, z1_len, tmp_buf, tmp_len, z1_buf); + } - ASSERT(z_buf_end[0] == ERTS_HOLE_MARKER); + /* Add z1 shifted into the result */ + I_OPERATION(res_len, I_add, z0_buf+m, z2_len+m, z1_buf, z1_len, z0_buf+m); /* Normalize */ - while (z_res_buf[m + res_len - 1] == 0 && res_len > 0) { + res_len += m; + while (res_len > 1 && r[res_len - 1] == 0) { res_len--; } - res_len += m; + ASSERT(res_len == 1 || r[res_len-1] != 0); ASSERT(res_len <= xl + yl); - /* Copy result to the final result buffer. */ - (void) I_add(z_res_buf, res_len, &zero, 1, r); - erts_free(ERTS_ALC_T_TMP, (void *) heap); return res_len; } diff --git a/erts/emulator/test/big_SUITE_data/karatsuba.dat b/erts/emulator/test/big_SUITE_data/karatsuba.dat index c0a0a72647..d3eeb1edda 100644 --- a/erts/emulator/test/big_SUITE_data/karatsuba.dat +++ b/erts/emulator/test/big_SUITE_data/karatsuba.dat @@ -2,3 +2,5 @@ 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041. 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260. 111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111063449349767560406261235142392341647649025372723765646039413496854749810861561328494820951780143140256625626909176959305680458786553388553625107116000046915830257816726584619722724548281507789670906469943008403108740971884016290536011259624824529432103615715080701414809437629318331795425002605292373621627076391706037344544674952930655508373167888426163639927117607800531055216745789633238812571951209824449373098738720474325546029838911536645854016658738287810538248535167943048401518088275884360994522447081283976255652056784266085927715913003002338073550339930729586019785550085812920084971952784494047195550542735273493532002238413738149969014369426810026122179115108673871125270161926777717396432145405190153796887093089464334888549712739386389592196847353891969079332035195409407770223110583450176399233173852721971468580226906388907065682291829282704839192753805483705821353760692262887904886094845663298931749956912234282496355841821173993089465864518791095006027639953496042767023689710197973028632817727184498866638467324567019639370889678900544190692386873018896672246736368046857567131910710307733567633067262431422838734848 = 227244752509208666244300049023218501664936565624639514576438496566054518678090377206901554539288856996266272083253480866328891556584179795376043393179397439111001075307836347541226766323267029349186809052518632576630669501495499405707466858562586370759397778341785328471459982484138277798132149400741413820728000336730772425983653136275550466388207243285757977918 * 488949073121539043178895838851406898867685260673611653386208527301320256750945098442460084709113182789107256214464892052884956575049058585335894004824225556978848052700870382486432811126512032810019093103514678580988403100655092773497194921652262878660895819095201202957727846410635903499346091564032100080922029110666198856874372068909543752725127446446275130141704384403153173213642973260347547120927518593494548154693042687813126757638259919480596660015785042800818110628659177941014457037586017066097793890372685757581701160719984224943404424801688527463891923678834379546779645545606419569427574824563111913480237499286872496822079247464512671595600027881545516790726794520875456846025695904828856824122680691695622222780388482248652558345646407544371724549307907546148267185543556530668197490478774267191002528938567372851525085232721844598755176527243119768654561142351728852269053413174476938571816914419435443895662361267309023154705765158795604804766486473731465844362909575530822196003981963565054860681271890620405191718478159479477685388913902388171544282368568608479247952664643776459165494556000797676507398085908822491136. +16#aeb17ba36a5a62ac6f0aad2b264d0787363825b9f0edf1ddd6e3d06eb970b70c90d5a43da0e234d85a2bd692ac118318965a1fa855019b8c65f32487755dc5677e27863aa4e4a6a82a76884c4d5d78f5b7807151b0179ee3b387b2118211610d832d1e7367a0e3cd50cce3ce2810e3567fc3fddf180c5ccd0572dc0f8662ef54e864e6182c3f951deff6d4a6cead4322e9bf3d55276f9dbdab649fa18fbdeaa89c002e037bb9090b1a5907ab6d18de09f8f376efdc0341ae360aa732405bf83cfe8342d644443208cfb8ef0568cd597de1ce7389878e48863bf0ebf1538ce2c317d8ac9f81976ae51617d7f6939582a8c28375caab30052d8ddf1b2995fb3891ea4541ef3d92bff37b6726052e8d7530b1f64a3cdfbba9cc320b55b2504417ff21986ceaaab8d4f73fafca6076e04fda786562571c5482b1f06b9b2762f51f3c1734284916153b377f147feb9ab398cf9ee46ba272c0ec8685f5a3832ff4e32aca370591f68bf38523839bd7367ebe02170150e87c69c3ff0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 = 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d3734c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 * 16#aeb17ba36a5a62ace6406c497e8f672a2868e5136afa7da54eeb841bd62cc3c37a19b21ca3421a2bcb9adc043a78dc70d23fcb0bf761ee620df5f22122beadfc580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c8cb370962d3b988382e6491831cb48fa4ddf32deb2fdbca9e64763a862beef086fb51183fb9e4af5b71ca36ee3159551bcbfdf55a685e238faef19254d9350dcd811af2dbf8859e4ce82bdb8632091e0100000000000000000000000000000000000000000000000000000000000000000000000000000000. +16#34c8f69d2c4677c6d19b6e7ce34b705bd0be4db83a7e980e81ca31c352a076a32d17ccd3b115ce49dd214d2da4d36ea7ae1bbcc23ae3f69c1ca949af6143cea35124d82ffedc501525ca169af0b58ffb580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c9 = 16#34c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 - 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d37. -- 2.35.3
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor