Kod: Zaznacz cały
//typedef uint64_t limb_t;
typedef uint8_t limb_t;
const int limbBits = sizeof(limb_t)*8;
const int hlimbBits = limbBits / 2;
const limb_t hlimbBit = 1 << hlimbBits;
const limb_t hlimbMask = (1 << hlimbBits) - 1;
void mulTwoLimbs(limb_t a, limb_t b, limb_t &p, limb_t &c)
{
limb_t carry;
limb_t al = a & hlimbMask;
limb_t ah = a >> hlimbBits;
limb_t bl = b & hlimbMask;
limb_t bh = b >> hlimbBits;
limb_t p0 = al*bl;
carry = p0 >> hlimbBits;
p0 &= hlimbMask;
limb_t p1 = al*bh+carry;
limb_t c0 = p1 >> hlimbBits;
p1 &= hlimbMask;
p1 += ah*bl;
carry = p1 >> hlimbBits;
p1 &= hlimbMask;
c0 += ah*bh + carry;
carry = c0 >> hlimbBits;
c0 &= hlimbMask;
p = p0 + (p1 << hlimbBits);
c = c0 + (carry << hlimbBits);
}
Wygląda to tak: mnożę dwie cyfry A i B przez C i D, BA - złączenie, B.A - mnożenie
Kod: Zaznacz cały
BA
DC
------------
A.C
A.D
B.C
B.D
-----------
Kod: Zaznacz cały
void mulTwoLimbsGMP(limb_t a, limb_t b, limb_t &p, limb_t &c)
{
limb_t carry;
limb_t A = a & hlimbMask;
limb_t B = a >> hlimbBits;
limb_t C = b & hlimbMask;
limb_t D = b >> hlimbBits;
limb_t AC = A * C;
limb_t AD = A * D;
limb_t BC = B * C;
limb_t BD = B * D;
limb_t AC_AD_BC = AD + (AC >> hlimbBits) + BC;
if (AC_AD_BC < BC) //carry
BD += hlimbBit;
c = BD + (AC_AD_BC >> hlimbBits);
p = (AC_AD_BC << hlimbBits) + (AC & hlimbMask);
}
Ciekawe że if działa bardzo szybko, myślałem że rozgałęzienia najbardziej zwalniają, ale w nowym procesorze Intel i3 jakoś poradzili sobie z tym.