helpless/oogabooga/unicode.c



// UTF-16 surrogate constants used by utf16_to_utf32().
// A unit in [HIGH_START, HIGH_END] is accepted as the first unit of a pair.
#define UTF16_SURROGATE_HIGH_START  0xD800
#define UTF16_SURROGATE_HIGH_END    0xDBFF
// A unit in [LOW_START, LOW_END] is accepted as the second unit of a pair and
// rejected when it appears first.
#define UTF16_SURROGATE_LOW_START   0xDC00
#define UTF16_SURROGATE_LOW_END     0xDFFF
// Added to the combined 20-bit pair payload to form the final code point.
#define UTF16_SURROGATE_OFFSET      0x10000
// Low ten bits. NOTE(review): not referenced in the visible part of this file.
#define UTF16_SURROGATE_MASK        0x3FF

// Decodes the single code point found at the start of a UTF-16 buffer.
//
//   utf16  - code units to read
//   length - number of u16 units available at utf16
//   utf32  - receives the decoded code point
//
// Returns how many utf16 units were converted: 1 for a plain unit, 2 for a
// surrogate pair. Returns -1 and leaves *utf32 untouched when length is 0,
// when either pointer is NULL, when a high surrogate is the last unit or is
// not followed by a low surrogate, or when the buffer starts with a low
// surrogate.
int utf16_to_utf32(const u16 *utf16, u64 length, u32 *utf32) {
    if (utf16 == NULL || utf32 == NULL || length == 0) {
        return -1;
    }

    const u16 lead = utf16[0];

    const int lead_is_high = lead >= UTF16_SURROGATE_HIGH_START
                          && lead <= UTF16_SURROGATE_HIGH_END;
    const int lead_is_low  = lead >= UTF16_SURROGATE_LOW_START
                          && lead <= UTF16_SURROGATE_LOW_END;

    if (!lead_is_high) {
        // A low surrogate cannot stand first; anything else maps 1:1.
        if (lead_is_low) {
            return -1;
        }
        *utf32 = lead;
        return 1;
    }

    // High surrogate: the second half of the pair must be present and valid.
    if (length < 2) {
        return -1;
    }

    const u16 trail = utf16[1];
    if (trail < UTF16_SURROGATE_LOW_START || trail > UTF16_SURROGATE_LOW_END) {
        return -1;
    }

    // Ten payload bits from each half, then rebase above the 16-bit range.
    *utf32 = ((lead - UTF16_SURROGATE_HIGH_START) << 10)
             + (trail - UTF16_SURROGATE_LOW_START)
             + UTF16_SURROGATE_OFFSET;
    return 2;
}

// Yoinked from jai Unicode.jai

// Code point that utf8_to_utf32() reports in place of input it could not decode.
#define UNI_REPLACEMENT_CHAR 0x0000FFFD
// Decoded values above this are replaced with UNI_REPLACEMENT_CHAR.
#define UNI_MAX_UTF32        0x7FFFFFFF
// Strict decoding rejects values above this.
#define UNI_MAX_UTF16        0x0010FFFF
// Strict decoding rejects values inside [SURROGATES_START, SURROGATES_END].
#define SURROGATES_START     0xD800
#define SURROGATES_END       0xDFFF

// Result of decoding one code point with utf8_to_utf32().
typedef struct {
	// Decoded code point, or UNI_REPLACEMENT_CHAR on the error returns.
	u32 utf32;
	// Despite the name, this is the number of bytes to advance by:
	// on success it is the trailing-byte count + 1 (lead byte included);
	// on error the value depends on which check failed.
	// next_utf8() moves the string forward by exactly this amount.
	s64 continuation_bytes;
	// Set to true together with `error` on every error return of
	// utf8_to_utf32(), and to false on success.
	bool reached_end;
	// True when the input could not be decoded.
	bool error;
} Utf8_To_Utf32_Result;

// Lookup table indexed by the first byte of a UTF-8 sequence; yields how many
// further bytes utf8_to_utf32() reads after that lead byte:
//   0x00-0xBF -> 0   0xC0-0xDF -> 1   0xE0-0xEF -> 2
//   0xF0-0xF7 -> 3   0xF8-0xFB -> 4   0xFC-0xFF -> 5
const u8 trailing_bytes_for_utf8[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
// Indexed by the trailing-byte count (0..5): mask that keeps only the payload
// bits of the lead byte before the trailing bytes are shifted in.
const u8 utf8_inital_byte_mask[] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };

// Decodes the first UTF-8 encoded code point of a byte buffer.
//
//   s             - bytes to decode
//   source_length - number of readable bytes at s
//   strict        - when true, malformed input is rejected: a continuation
//                   byte (10xxxxxx) in lead position, trailing bytes that are
//                   not 10xxxxxx, overlong encodings, surrogate code points,
//                   values above UNI_MAX_UTF16 and 5/6-byte sequences.
//                   When false, only truncated/empty input is an error.
//
// On success: utf32 holds the code point, continuation_bytes holds the total
// number of bytes consumed (lead byte included), reached_end and error are
// false.
// On failure: utf32 is UNI_REPLACEMENT_CHAR, reached_end and error are true,
// and continuation_bytes holds how many bytes to skip:
//   - 0 for a NULL or empty buffer,
//   - source_length when the sequence is cut off by the end of the buffer,
//   - the number of bytes that precede an invalid trailing byte (always >= 1,
//     so a caller that advances by it makes progress),
//   - the full sequence length for every other strict failure.
Utf8_To_Utf32_Result utf8_to_utf32(u8 *s, s64 source_length, bool strict) {
    // Nothing to decode; bail out before s[0] is read.
    if (!s || source_length <= 0) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, 0, true, true};
    }

    s64 continuation_bytes = trailing_bytes_for_utf8[s[0]];

    // The lead byte announces more bytes than the buffer holds.
    if (continuation_bytes + 1 > source_length) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, source_length, true, true};
    }

    // A 10xxxxxx byte can only continue a sequence, never start one.
    if (strict && (s[0] & 0xC0) == 0x80) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, 1, true, true};
    }

    u32 ch = s[0] & utf8_inital_byte_mask[continuation_bytes];

    for (s64 i = 1; i <= continuation_bytes; i++) {  // Do nothing if it is 0.
        if (strict && (s[i] & 0xC0) != 0x80) {
            // s[0..i-1] belong to the broken sequence; s[i] is left for the
            // caller because it may start the next one.
            return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, i, true, true};
        }
        ch = (ch << 6) | (s[i] & 0x3F);
    }

    if (strict) {
        // Fewest trailing bytes able to encode ch. A longer sequence is an
        // overlong encoding; the value is capped at 3, so this comparison
        // also rejects the 5- and 6-byte forms.
        s64 minimal_trailing;
        if (ch <= 0x0000007F) {
            minimal_trailing = 0;
        } else if (ch <= 0x000007FF) {
            minimal_trailing = 1;
        } else if (ch <= 0x0000FFFF) {
            minimal_trailing = 2;
        } else {
            minimal_trailing = 3;
        }

        if (ch > UNI_MAX_UTF16 ||
          (SURROGATES_START <= ch && ch <= SURROGATES_END) ||
          continuation_bytes != minimal_trailing) {
            return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, continuation_bytes+1, true, true};
        }
    }

    if (ch > UNI_MAX_UTF32) {
        ch = UNI_REPLACEMENT_CHAR;
    }

    return (Utf8_To_Utf32_Result){ ch, continuation_bytes+1, false, false };
}

// Decodes the code point at the front of *s (non-strict decoding) and advances
// s->data / s->count past the bytes that were consumed.
//
// Returns the decoded code point, or 0 on fail. Failure means the string is
// empty (s is left unchanged) or its last sequence is cut short (the remaining
// bytes are consumed). A decoded U+0000 also yields 0, so callers that need to
// tell the cases apart should compare s->count before and after the call.
u32 next_utf8(string *s) {
    // Empty string: nothing to decode, and s->data[0] must not be read.
    if (s->count <= 0) return 0;

    Utf8_To_Utf32_Result result = utf8_to_utf32(s->data, s->count, false);

    s->data  += result.continuation_bytes;
    s->count -= result.continuation_bytes;
    assert(s->count >= 0);

    if (result.error) return 0;

    return result.utf32;
}
- User input - Input state polling & consuming - is_key_down, is_key_just_pressed, is_key_just_released - consume_key_down, consume_key_just_pressed, consume_key_just_released - Input events - Key event - Scroll event - Text event - unicode.c - utf16_to_utf32 2024-06-29 17:54:30 +02:00

			`#define UTF16_SURROGATE_HIGH_START 0xD800`
			`#define UTF16_SURROGATE_HIGH_END 0xDBFF`
			`#define UTF16_SURROGATE_LOW_START 0xDC00`
			`#define UTF16_SURROGATE_LOW_END 0xDFFF`
			`#define UTF16_SURROGATE_OFFSET 0x10000`
			`#define UTF16_SURROGATE_MASK 0x3FF`

			`// Returns how many utf16 units were converted`
			`int utf16_to_utf32(const u16 utf16, u64 length, u32 utf32) {`
			`if (length == 0 \|\| utf16 == NULL \|\| utf32 == NULL) {`
			`return -1;`
			`}`

			`u16 first = utf16[0];`

			`if (first >= UTF16_SURROGATE_HIGH_START && first <= UTF16_SURROGATE_HIGH_END) {`
			`if (length < 2) {`
			`return -1;`
			`}`

			`u16 second = utf16[1];`
			`if (second >= UTF16_SURROGATE_LOW_START && second <= UTF16_SURROGATE_LOW_END) {`
			`*utf32 = ((first - UTF16_SURROGATE_HIGH_START) << 10)`
			`+ (second - UTF16_SURROGATE_LOW_START)`
			`+ UTF16_SURROGATE_OFFSET;`
			`return 2;`
			`} else {`
			`return -1;`
			`}`
			`} else if (first >= UTF16_SURROGATE_LOW_START && first <= UTF16_SURROGATE_LOW_END) {`
			`return -1;`
			`} else {`
			`*utf32 = first;`
			`return 1;`
			`}`
- Text rendering - Font loading - Measuring for formatting & justification - Utf8 Glyph walking - Commented example in oogabooga/examples/text_rendering.c - Small 2D renderer refactor - Pass 8-bit integers "type" and "sampler_index" to shader - Sample texture differently depending on "type" (text or regular quad) - Sample with nearest/linear min/mag depending on sampler_index - Set min/mag filtering in Draw_Quad - Images are now created and deleted directly with gfx calls rather than deferring it for gfx_update. - We can now set image sub data with gfx_set_image_data() - Images are no longer hard coded to 4 channels - Utf8 utility: - utf8_to_utf32(): convert utf8 bytes to a single u32 codepoint - next_utf8(): Convert first utf8 character in a string to a u32 codepoint and advance the passed string to the next unicode - Renamed m4_multiply -> m4_mul for consistency - Refactored os window to be DPI aware (scaled_width vs pixel_width) - in minimal example, renamed hammer_xform -> rect_xform 2024-07-07 20:27:34 +02:00			`}`

			`// Yoinked from jai Unicode.jai`

			`#define UNI_REPLACEMENT_CHAR 0x0000FFFD`
			`#define UNI_MAX_UTF32 0x7FFFFFFF`
			`#define UNI_MAX_UTF16 0x0010FFFF`
			`#define SURROGATES_START 0xD800`
			`#define SURROGATES_END 0xDFFF`

			`typedef struct {`
			`u32 utf32;`
			`s64 continuation_bytes;`
			`bool reached_end;`
			`bool error;`
			`} Utf8_To_Utf32_Result;`

			`const u8 trailing_bytes_for_utf8[] = {`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5`
			`};`
			`const u8 utf8_inital_byte_mask[] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };`

			`Utf8_To_Utf32_Result utf8_to_utf32(u8 *s, s64 source_length, bool strict) {`
			`s64 continuation_bytes = trailing_bytes_for_utf8[s[0]];`

			`if (continuation_bytes + 1 > source_length) {`
			`return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, source_length, true, true};`
			`}`

			`u32 ch = s[0] & utf8_inital_byte_mask[continuation_bytes];`

			`for (u64 i = 1; i <= continuation_bytes; i++) { // Do nothing if it is 0.`
			`ch = ch << 6;`
			`if (strict) if ((s[i] & 0xC0) != 0x80) return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, i - 1, true, true};`
			`ch \|= s[i] & 0x3F;`
			`}`

			`if (strict) {`
			`if (ch > UNI_MAX_UTF16 \|\|`
			`(SURROGATES_START <= ch && ch <= SURROGATES_END) \|\|`
			`(ch <= 0x0000007F && continuation_bytes != 0) \|\|`
			`(ch <= 0x000007FF && continuation_bytes != 1) \|\|`
			`(ch <= 0x0000FFFF && continuation_bytes != 2) \|\|`
			`continuation_bytes > 3) {`
			`return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, continuation_bytes+1, true, true};`
			`}`
			`}`

			`if (ch > UNI_MAX_UTF32) {`
			`ch = UNI_REPLACEMENT_CHAR;`
			`}`

			`return (Utf8_To_Utf32_Result){ ch, continuation_bytes+1, false, false };`
			`}`

			`// Returns 0 on fail`
			`u32 next_utf8(string *s) {`
			`Utf8_To_Utf32_Result result = utf8_to_utf32(s->data, s->count, false);`

			`s->data += result.continuation_bytes;`
			`s->count -= result.continuation_bytes;`
			`assert(s->count >= 0);`

			`if (result.error) return 0;`

			`return result.utf32;`
- User input - Input state polling & consuming - is_key_down, is_key_just_pressed, is_key_just_release - consume_key_down, consume_key_just_pressed, consume_key_just_released - Input events - Key event - Scroll event - Text event - unicode.c - utf16_to_utf32 2024-06-29 17:54:30 +02:00			`}`