开发者

optimising btwoc()

According to OpenID Authentication 2.0, section 4.2,

Arbitrary precision integers MUST be encoded as big-endian signed two's complement binary strings. Henceforth, "btwoc" is a function that takes an arbitrary precision integer and returns its shortest big-endian two's complement representation. All integers that are used with Diffie-Hellman Key Exchange are positive. This means that the left-most bit of the two's complement representation MUST be zero. If it is not, implementations MUST add a zero byte at the front of the string.

Non-normative example:

Base 10 number | btwoc string representation
---------------+----------------------------
0              | "\x00"
127            | "\x7F"
128            | "\x00\x80"
255            | "\x00\xFF"
32768          | "\x00\x80\x00"

I have tried writing my own implementation of btwoc in C, and this is what I have:

typedef struct {
    uint8_t *data;
    uintmax_t length;
} oid_data;

oid_data *oid_btwoc_r(uintmax_t value,开发者_Python百科 oid_data *ret) {
    unsigned    fnz = sizeof(uintmax_t) + 1,
                i = sizeof(uintmax_t) * 8;
    while (--fnz && (!(value >> (i -= 8) & 0xFF)));
    /*
        If `value' is non-zero, `fnz' now contains the index of the first
        non-zero byte in `value', where 1 refers to the least-significant byte.
        `fnz' will therefore be in the range [1 .. sizeof(uintmax_t)]. If
        `value' is zero, then `fnz' is zero.
    */
    if (!value) {
        /* The value is zero */
        ret->length = 1;
        ret->data[0] = 0;
    } else if (value >> ((fnz - 1) * 8 + 7)) {
        /* The most significant bit of the first non-zero byte is 1 */
        ret->length = fnz + 1;
        ret->data[0] = 0;
        for (i = 1; i <= fnz; i++)
            ret->data[1 + fnz - i] =
                value >> ((i - 1) * 8);
    } else {
        /* The most significant bit of the first non-zero byte is 0 */
        ret->length = fnz;
        for (i = 1; i <= fnz; i++)
            ret->data[fnz - i] =
                value >> ((i - 1) * 8);
    }
    return ret;
}

ret->data should point to valid memory of at least sizeof(uintmax_t) + 1 bytes.

It works fine, and I haven't discovered any bugs in the implementation yet, but can it be optimised?


If you keep zero as a special case, you should surely deal with it before the while loop. (Personal bête noire: I dislike (!value) for (value == 0).)


I've not tried timing this (yet), but this code produces the same answer as yours (at least on the values I tested; see below) and looks simpler to me, not least because it doesn't double up the code to deal with the case where the high bit is set in the most significant byte:

void oid_btwoc_r2(uintmax_t value, oid_data *ret)
{
    if (value == 0)
    {
        ret->data[0] = 0;
        ret->length  = 1;
        return;
    }

    uintmax_t v0 = value;
    uintmax_t v1 = v0;
    unsigned  n  = 0;

    while (v0 != 0)
    {
        n++;
        v1 = v0;
        v0 >>= 8;
    }
    //printf("Value: 0x%" PRIXMAX ", v1 = 0x%" PRIXMAX "\n", value, v1);

    assert(v1 < 0x100 && v1 != 0);
    if (v1 > 0x7F)
        n++;
    //printf("N = %u\n", n);

    for (unsigned i = n; i != 0; i--)
    {
        ret->data[i-1] = (value & 0xFF);
        value >>= 8;
    }
    ret->length = n;
}

The code I used to test this against your version was:

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

#define DIM(x)  (sizeof(x) / sizeof(*(x)))

static void dump_oid_data(FILE *fp, const char *tag, const oid_data *data)
{
    fprintf(fp, "%s: length %" PRIuMAX ":", tag, data->length);
    for (uintmax_t i = 0; i < data->length; i++)
        fprintf(fp, " 0x%02X", data->data[i]);
    fputc('\n', fp);
}

int main(void)
{
    uintmax_t list[] = { 0, 0x7F, 0x80, 0xFF, 0x100, 0x7FFF, 0x8000, 0xFFFF,
                         0x10000, 0x7FFFFF, 0x800000, 0xFFFFFF };

    for (size_t i = 0; i < DIM(list); i++)
    {
        uint8_t b1[sizeof(uintmax_t) + 1];
        uint8_t b2[sizeof(uintmax_t) + 1];
        oid_data v1 = { b1, sizeof(b1) };
        oid_data v2 = { b2, sizeof(b2) };
        oid_btwoc_r(list[i], &v1);
        oid_btwoc_r2(list[i], &v2);
        printf("Value: 0x%" PRIXMAX ": ", list[i]);
        if (v1.length != v2.length)
            printf("Lengths differ (%" PRIuMAX " vs %" PRIuMAX ")\n", v1.length, v2.length);
        else if (memcmp(v1.data, v2.data, v1.length) != 0)
        {
            printf("Data differs!\n");
            dump_oid_data(stdout, "oid_btwoc_r1()", &v1);
            dump_oid_data(stdout, "oid_btwoc_r2()", &v2);
        }
        else
        {
            printf("Values are the same\n");
            dump_oid_data(stdout, "oid_btwoc_rN()", &v2);
        }
    }
    return(0);
}

Yes; an early version of the code needed the dump function to see what was going wrong!

Timing

I adapted the original oid_btwoc_r() to a function returning void (no final return) and renamed it oid_btwoc_r1(). I then ran a timing test (on a Mac Mini running MacOS X Lion 10.7.1), and got the timing results:

oid_btwoc_r1(): 4.925386
oid_btwoc_r2(): 4.022604
oid_btwoc_r1(): 4.930649
oid_btwoc_r2(): 4.004344
oid_btwoc_r1(): 4.927602
oid_btwoc_r2(): 4.005756
oid_btwoc_r1(): 4.923356
oid_btwoc_r2(): 4.007910
oid_btwoc_r1(): 4.984037
oid_btwoc_r2(): 4.202986
oid_btwoc_r1(): 5.015747
oid_btwoc_r2(): 4.067265
oid_btwoc_r1(): 4.982333
oid_btwoc_r2(): 4.019807
oid_btwoc_r1(): 4.957866
oid_btwoc_r2(): 4.074712
oid_btwoc_r1(): 4.993991
oid_btwoc_r2(): 4.042422
oid_btwoc_r1(): 4.970930
oid_btwoc_r2(): 4.077203

Consequently, it appears that the oid_btwoc_r2() function is about 20% faster than the original - at least on the data tested. Bigger numbers alter the balance in favour of oid_btwoc_r(), with `oid_btwoc_r1() then being about 20% faster:

oid_btwoc_r1(): 3.671201
oid_btwoc_r2(): 4.605171
oid_btwoc_r1(): 3.669026
oid_btwoc_r2(): 4.575745
oid_btwoc_r1(): 3.673729
oid_btwoc_r2(): 4.659433
oid_btwoc_r1(): 3.684662
oid_btwoc_r2(): 4.671654
oid_btwoc_r1(): 3.730757
oid_btwoc_r2(): 4.645485
oid_btwoc_r1(): 3.764600
oid_btwoc_r2(): 4.673244
oid_btwoc_r1(): 3.669582
oid_btwoc_r2(): 4.610177
oid_btwoc_r1(): 3.664248
oid_btwoc_r2(): 4.813711
oid_btwoc_r1(): 3.675927
oid_btwoc_r2(): 4.630148
oid_btwoc_r1(): 3.681798
oid_btwoc_r2(): 4.614129

Since big numbers are probably more likely than small ones in this context, oid_btwoc_r1() - or the original oid_btwoc_r() - is arguably the better choice.

The test code follows. The uncommented for loop is the 'large number' version which shows oid_btwoc_r1() working faster than oid_btwoc_r2(); the commented out for loop is the 'small number' version which shows oid_btwoc_r2() working faster than oid_btowc_r1().

static void test_converter(const char *tag, void (*function)(uintmax_t, oid_data *))
{
    Clock clk;
    clk_init(&clk);
    clk_start(&clk);
    for (uintmax_t i = 0x100000000; i < 0x1000000000000000; i += 0x170000000)
    //for (uintmax_t i = 0; i < 0x100000000; i += 17)
    {
        uint8_t b1[sizeof(uintmax_t) + 1];
        oid_data v1 = { b1, sizeof(b1) };
        (*function)(i, &v1);
    }
    clk_stop(&clk);
    char buffer[32];
    printf("%s: %s\n", tag, clk_elapsed_us(&clk, buffer, sizeof(buffer)));
}

int main(void)
{
    for (int i = 0; i < 10; i++)
    {
        test_converter("oid_btwoc_r1()", oid_btwoc_r1);
        test_converter("oid_btwoc_r2()", oid_btwoc_r2);
    }
    return(0);
}
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜