helpless/oogabooga/unicode.c



// UTF-16 surrogate constants used by utf16_to_utf32 below.
//
// A unit in [HIGH_START, HIGH_END] is treated as the first half of a
// surrogate pair and must be followed by a unit in [LOW_START, LOW_END].
#define UTF16_SURROGATE_HIGH_START  0xD800
#define UTF16_SURROGATE_HIGH_END    0xDBFF
#define UTF16_SURROGATE_LOW_START   0xDC00
#define UTF16_SURROGATE_LOW_END     0xDFFF
// Added to the combined 20 bits of a surrogate pair to form the code point.
#define UTF16_SURROGATE_OFFSET      0x10000
// NOTE(review): not referenced in the visible part of this file; presumably
// the 10-bit payload mask of a single surrogate -- confirm against callers.
#define UTF16_SURROGATE_MASK        0x3FF

// Decodes the single code point that starts at utf16[0].
//
// utf16  - input buffer of UTF-16 code units.
// length - number of units available in utf16.
// utf32  - receives the decoded code point on success; untouched on failure.
//
// Returns how many utf16 units were converted (1 or 2), or -1 when an
// argument is NULL/empty or the input is not well formed (a lone low
// surrogate, or a high surrogate that is not followed by a low surrogate).
int utf16_to_utf32(const u16 *utf16, u64 length, u32 *utf32) {
    if (utf16 == NULL || utf32 == NULL || length == 0) {
        return -1;
    }

    const u16 lead = utf16[0];

    // Outside the whole surrogate range: the unit is the code point itself.
    if (lead < UTF16_SURROGATE_HIGH_START || lead > UTF16_SURROGATE_LOW_END) {
        *utf32 = lead;
        return 1;
    }

    // A low surrogate can never open a pair.
    if (lead >= UTF16_SURROGATE_LOW_START) {
        return -1;
    }

    // From here on `lead` is a high surrogate and needs a partner unit.
    if (length < 2) {
        return -1;
    }

    const u16 trail = utf16[1];
    if (trail < UTF16_SURROGATE_LOW_START || trail > UTF16_SURROGATE_LOW_END) {
        return -1;
    }

    // Ten payload bits from each half, then shift into the supplementary planes.
    *utf32 = ((lead - UTF16_SURROGATE_HIGH_START) << 10)
             + (trail - UTF16_SURROGATE_LOW_START)
             + UTF16_SURROGATE_OFFSET;
    return 2;
}

// Yoinked from jai Unicode.jai
// The constants, tables and utf8_to_utf32 below are a port of that code.

// Code point handed back by utf8_to_utf32 whenever decoding fails.
#define UNI_REPLACEMENT_CHAR 0x0000FFFD
// Largest value utf8_to_utf32 returns as-is in non-strict mode; anything
// above it is replaced with UNI_REPLACEMENT_CHAR.
#define UNI_MAX_UTF32        0x7FFFFFFF
// Upper bound enforced on decoded code points in strict mode.
#define UNI_MAX_UTF16        0x0010FFFF
// Inclusive range of surrogate values, rejected by strict UTF-8 decoding.
#define SURROGATES_START     0xD800
#define SURROGATES_END       0xDFFF

// Result of decoding one code point with utf8_to_utf32.
typedef struct {
	// Decoded code point, or UNI_REPLACEMENT_CHAR when decoding failed.
	u32 utf32;
	// Despite the name, this is the number of bytes the caller should skip:
	// next_utf8 advances its string by exactly this amount. On success it is
	// the full sequence length (lead byte + continuation bytes).
	s64 continuation_bytes;
	// Set together with `error` on every failure path of utf8_to_utf32
	// (including malformed input, not only truncated input).
	bool reached_end;
	// True when no valid code point could be produced.
	bool error;
} Utf8_To_Utf32_Result;

// Indexed by the first byte of a UTF-8 sequence; yields how many bytes
// follow that first byte:
//   0x00..0xBF -> 0, 0xC0..0xDF -> 1, 0xE0..0xEF -> 2,
//   0xF0..0xF7 -> 3, 0xF8..0xFB -> 4, 0xFC..0xFF -> 5.
// Note that bytes 0x80..0xBF (continuation bytes) map to 0 here, so the
// table alone does not reject them as lead bytes.
const u8 trailing_bytes_for_utf8[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
// Indexed by the trailing-byte count from the table above; masks off the
// length-marker bits of the first byte, leaving only its payload bits.
const u8 utf8_inital_byte_mask[] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };

// Decodes the single code point that starts at s[0].
//
// s             - first byte of the sequence to decode.
// source_length - number of readable bytes starting at s.
// strict        - when true, additionally rejects: continuation bytes that
//                 do not look like 10xxxxxx, a continuation byte used as the
//                 lead byte, sequences longer than 4 bytes, overlong
//                 encodings, surrogates and values above UNI_MAX_UTF16.
//
// On success: { code point, bytes consumed, false, false }.
// On failure: { UNI_REPLACEMENT_CHAR, bytes to skip, true, true }, where
// "bytes to skip" is 0 for empty input, source_length for a truncated
// sequence, and the sequence length for a rejected (strict) sequence.
Utf8_To_Utf32_Result utf8_to_utf32(u8 *s, s64 source_length, bool strict) {
    // Smallest code point that legitimately needs N continuation bytes;
    // anything below it is an overlong encoding.
    static const u32 min_code_point[] = { 0x0, 0x80, 0x800, 0x10000 };

    // Nothing to decode: bail out before s[0] is read.
    if (!s || source_length <= 0) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, 0, true, true};
    }

    s64 continuation_bytes = trailing_bytes_for_utf8[s[0]];

    // The lead byte announces more bytes than the buffer holds.
    if (continuation_bytes + 1 > source_length) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, source_length, true, true};
    }

    u32 ch = s[0] & utf8_inital_byte_mask[continuation_bytes];

    for (s64 i = 1; i <= continuation_bytes; i++) {  // Do nothing if it is 0.
        ch = ch << 6;
        if (strict && (s[i] & 0xC0) != 0x80) {
            // NOTE(review): the skip count `i - 1` is kept from the original
            // port; it is 0 when the very first continuation byte is bad, so
            // a strict caller must not rely on it to make progress.
            return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, i - 1, true, true};
        }
        ch |= s[i] & 0x3F;
    }

    if (strict) {
        // The length table maps 0x80..0xBF to "0 trailing bytes", so a stray
        // continuation byte would otherwise pass as a 1-byte character.
        bool lead_is_continuation = (s[0] & 0xC0) == 0x80;

        // continuation_bytes > 3 is tested before the table lookup so the
        // index into min_code_point is always in range.
        if (lead_is_continuation ||
            continuation_bytes > 3 ||
            ch > UNI_MAX_UTF16 ||
            (SURROGATES_START <= ch && ch <= SURROGATES_END) ||
            ch < min_code_point[continuation_bytes]) {
            return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, continuation_bytes+1, true, true};
        }
    }

    // Defensive clamp for the lenient path.
    if (ch > UNI_MAX_UTF32) {
        ch = UNI_REPLACEMENT_CHAR;
    }

    return (Utf8_To_Utf32_Result){ ch, continuation_bytes+1, false, false };
}

// Decodes the code point at the front of *s (non-strict) and advances *s
// past the bytes that were consumed.
//
// Returns 0 on fail. Note that 0 is also returned when the decoded code
// point itself is U+0000, so callers cannot tell the two apart from the
// return value alone.
u32 next_utf8(string *s) {
	// An empty string has no first byte to look at; report failure without
	// handing the decoder a pointer it would dereference.
	if (s->count == 0) return 0;

	Utf8_To_Utf32_Result result = utf8_to_utf32(s->data, s->count, false);

	// Skip whatever the decoder consumed, even on error, so a truncated
	// trailing sequence is swallowed rather than revisited.
	s->data  += result.continuation_bytes;
	s->count -= result.continuation_bytes;
	assert(s->count >= 0);

	if (result.error) return 0;

	return result.utf32;
}

// Converts a code point index into a byte offset within str.
//
// Walks str from its start, decoding one code point at a time with
// next_utf8, and returns the number of bytes covered by the first `index`
// code points. The walk stops early (returning the offset reached so far)
// when the string runs out or next_utf8 returns 0.
u64 utf8_index_to_byte_index(string str, u64 index) {
	u64 bytes_walked = 0;

	for (u64 decoded = 0; decoded < index; decoded += 1) {
		if (str.count == 0) break;

		// Remember where this code point started so its width can be measured.
		u8 *before = (u8*)str.data;
		if (next_utf8(&str) == 0) break;

		u64 step = (u64)(((u8*)str.data) - before);
		assert(step != 0);
		bytes_walked += step;
	}

	return bytes_walked;
}
// Returns a view of `count` code points of str, starting at code point
// `index`. Both ends are clamped to wherever utf8_index_to_byte_index stops
// (end of string, or the first position where next_utf8 returns 0).
string utf8_slice(string str, u64 index, u64 count) {
	u64 byte_index = utf8_index_to_byte_index(str, index);

	// Measure the slice from its own start instead of from the start of str:
	// `index + count` can wrap around in u64 (e.g. a huge count meaning
	// "to the end"), which would put the end before the start. This also
	// avoids decoding the prefix a second time.
	string rest = str;
	rest.data  += byte_index;
	rest.count -= byte_index;
	u64 byte_count = utf8_index_to_byte_index(rest, count);

	return string_view(str, byte_index, byte_count);
}