SWAR acceleration for UTF8 Hamming Distance #93

ashvardanian · 2024-02-18T03:23:15Z

The upcoming implementation contains only a serial variant. Once the demand for a more efficient implementation grows, one can add a SWAR accelerated implementation.

Draft

SZ_PUBLIC sz_size_t sz_hamming_distance_utf8( //
    sz_cptr_t a, sz_size_t a_length,          //
    sz_cptr_t b, sz_size_t b_length,          //
    sz_size_t bound) {

    sz_size_t const min_length = sz_min_of_two(a_length, b_length);
    sz_size_t const max_length = sz_max_of_two(a_length, b_length);
    sz_cptr_t const a_min_end = a + min_length;
    sz_size_t distance = 0;
    bound = bound == 0 ? max_length : bound;

#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN
    if (min_length >= SZ_SWAR_THRESHOLD) {
        sz_u64_vec_t a_vec, b_vec;
        sz_u64_vec_t a_2byte_vec, b_2byte_vec;
        sz_u64_vec_t prefixes_2byte_vec, match_vec, prefix_resolution_mask_vec;
        prefixes_2byte_vec.u64 = 0xC080C080C080C080ull;

        // Walk through both strings using SWAR and counting the number of differing characters.
        // At every point check if the next 8 bytes of content contain characters of equal length.
        for (; a + 8 <= a_min_end && distance < bound; a += 8, b += 8) {
            a_vec = sz_u64_load(a);
            b_vec = sz_u64_load(b);

            // Check if the prefixes contain up to 8x codepoints of length 1.
            // UTF8 1-byte codepoints look like: 0xxxxxxx.
            sz_size_t a_1byte_chars = sz_u64_ctz(a_vec.u64 & 0x8080808080808080ull) / 8;
            sz_size_t b_1byte_chars = sz_u64_ctz(b_vec.u64 & 0x8080808080808080ull) / 8;
            sz_size_t max_1byte_chars = sz_max_of_two(a_1byte_chars, b_1byte_chars);
            sz_size_t min_1byte_chars = sz_min_of_two(a_1byte_chars, b_1byte_chars);

            // Check if at least one of the strings starts with a 1-byte character.
            if (max_1byte_chars) {
                // One of the strings doesn't start with 1-byte characters.
                if (!a_1byte_chars && !b_1byte_chars) {
                    a += min_1byte_chars, b += min_1byte_chars;
                    continue;
                }
                // Both strings start with `min_1byte_chars`-many 1-byte characters.
                // Compare them using SWAR and skip.
                else {
                    match_vec = _sz_u64_each_byte_equal(a_vec, b_vec);
                    prefix_resolution_mask_vec = ...;
                    sz_size_t matching_prefix_length = sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull);
                    ...
                }
            }

            // Check if the prefixes contain up to 4x codepoints of length 2.
            // UTF8 2-byte codepoints look like: 110xxxxx 10xxxxxx.
            // Those bits will be matches with a 0xE0C0 repeated mask, matching the 0xC080 pattern.
            a_2byte_vec.u64 = a_vec.u64 & 0xE0C0E0C0E0C0E0C0ull;
            b_2byte_vec.u64 = b_vec.u64 & 0xE0C0E0C0E0C0E0C0ull;
            sz_size_t a_2byte_chars = sz_u64_ctz(_sz_u64_each_2byte_equal(a_2byte_vec, prefixes_2byte_vec).u64) / 16;
            sz_size_t b_2byte_chars = sz_u64_ctz(_sz_u64_each_2byte_equal(b_2byte_vec, prefixes_2byte_vec).u64) / 16;
            
        }
    }
#endif

    sz_rune_t a_rune, b_rune;
    sz_rune_length_t a_rune_length, b_rune_length;
    for (; a != a_min_end && distance < bound; a += a_rune_length, b += b_rune_length) {
        _sz_extract_utf8_rune(a, &a_rune_length, &a_rune);
        _sz_extract_utf8_rune(b, &b_rune_length, &b_rune);
        distance += (a_rune != b_rune);
    }

    return sz_min_of_two(distance, bound);
}

ashvardanian added good first issue Good for newcomers core Work on the algorithm design and implementation labels Feb 18, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

SWAR acceleration for UTF8 Hamming Distance #93

SWAR acceleration for UTF8 Hamming Distance #93

ashvardanian commented Feb 18, 2024

SWAR acceleration for UTF8 Hamming Distance #93

SWAR acceleration for UTF8 Hamming Distance #93

Comments

ashvardanian commented Feb 18, 2024