core/unicode/utf8 package | Odin Programming Language

Procedures and constants to support text-encoding in the `UTF-8` character encoding.

Collection Info

Collection: core
Path: unicode/utf8
Entries: 59

Constants 26

HICB LOCB MASK2 MASK3 MASK4 MASKX MAX_RUNE RUNE_BOM RUNE_EOF RUNE_ERROR RUNE_SELF RUNE1_MAX RUNE2_MAX RUNE3_MAX SURROGATE_HIGH_MAX SURROGATE_LOW_MIN +10 more

Types 4

Jump to Types

Accept_Range Grapheme Grapheme_Cluster_Sequence Grapheme_Iterator

Procedures 23

Jump to Procedures

decode_grapheme_clusters decode_grapheme_iterate decode_grapheme_iterator_make decode_last_rune_in_bytes decode_last_rune_in_string decode_rune_in_bytes decode_rune_in_string encode_rune full_rune_in_bytes full_rune_in_string grapheme_count rune_at rune_at_pos rune_count_in_bytes rune_count_in_string rune_offset +7 more

Procedure Groups 4

Jump to Procedure Groups

decode_last_rune decode_rune full_rune rune_count

Variables 2

Jump to Variables

accept_ranges accept_sizes

Source Files

Constants

HICB #

Source

HICB :: 0b1011_1111

LOCB #

Source

LOCB :: 0b1000_0000

LOCB and HICB are the default lowest and highest continuation bytes, respectively.

MASK2 #

Source

MASK2 :: 0b0001_1111

MASK3 #

Source

MASK3 :: 0b0000_1111

MASK4 #

Source

MASK4 :: 0b0000_0111

MASKX #

Source

MASKX :: 0b0011_1111

MAX_RUNE #

Source

MAX_RUNE :: '\U0010ffff'

RUNE_BOM #

Source

RUNE_BOM :: 0xfeff

RUNE_EOF #

Source

RUNE_EOF :: ~rune(0)

RUNE_ERROR #

Source

RUNE_ERROR :: '\ufffd'

RUNE_SELF #

Source

RUNE_SELF :: 0x80

RUNE1_MAX #

Source

RUNE1_MAX :: 1 << 7 - 1

RUNE2_MAX #

Source

RUNE2_MAX :: 1 << 11 - 1

RUNE3_MAX #

Source

RUNE3_MAX :: 1 << 16 - 1

SURROGATE_HIGH_MAX #

Source

SURROGATE_HIGH_MAX :: 0xdbff

A high/leading surrogate is in the range SURROGATE_MIN..SURROGATE_HIGH_MAX; a low/trailing surrogate is in the range SURROGATE_LOW_MIN..SURROGATE_MAX.

SURROGATE_LOW_MIN #

Source

SURROGATE_LOW_MIN :: 0xdc00

SURROGATE_MAX #

Source

SURROGATE_MAX :: 0xdfff

SURROGATE_MIN #

Source

SURROGATE_MIN :: 0xd800

T1 #

Source

T1 :: 0b0000_0000

T2 #

Source

T2 :: 0b1100_0000

T3 #

Source

T3 :: 0b1110_0000

T4 #

Source

T4 :: 0b1111_0000

T5 #

Source

T5 :: 0b1111_1000

TX #

Source

TX :: 0b1000_0000

UTF_MAX #

Source

UTF_MAX :: 4

ZERO_WIDTH_JOINER #

Source

ZERO_WIDTH_JOINER :: unicode.ZERO_WIDTH_JOINER

Types

Accept_Range #

Source

Accept_Range :: Accept_Range

Grapheme #

Source

Grapheme :: Grapheme

Grapheme_Cluster_Sequence #

Source

Grapheme_Cluster_Sequence :: Grapheme_Cluster_Sequence

Grapheme_Iterator #

Source

Grapheme_Iterator :: Grapheme_Iterator

Procedures

decode_grapheme_clusters #

Source

@(require_results)

decode_grapheme_clusters :: proc(str: string, track_graphemes: bool = true, allocator := context.allocator) -> (graphemes: [dynamic]Grapheme, grapheme_count: int, rune_count: int, width: int) {…}

Decode the individual graphemes in a UTF-8 string.

*Allocates Using Provided Allocator*

Inputs:
- str: The input string.
- track_graphemes: Whether or not to allocate and return `graphemes` with extra data about each grapheme.
- allocator: (default: context.allocator)

Returns:
- graphemes: Extra data about each grapheme.
- grapheme_count: The number of graphemes in the string.
- rune_count: The number of runes in the string.
- width: The width of the string in number of monospace cells.

decode_grapheme_iterate #

Source

@(require_results)

decode_grapheme_iterate :: proc(it: ^Grapheme_Iterator) -> (text: string, grapheme: Grapheme, ok: bool) {…}

decode_grapheme_iterator_make #

Source

@(require_results)

decode_grapheme_iterator_make :: proc(str: string) -> (it: Grapheme_Iterator) {…}

decode_last_rune_in_bytes #

Source

@(require_results)

decode_last_rune_in_bytes :: proc "contextless" (s: []u8) -> (rune, int) {…}

decode_last_rune_in_string #

Source

@(require_results)

decode_last_rune_in_string :: proc "contextless" (s: string) -> (rune, int) {…}

decode_rune_in_bytes #

Source

@(require_results)

decode_rune_in_bytes :: proc "contextless" (s: []u8) -> (rune, int) {…}

decode_rune_in_string #

Source

@(require_results)

decode_rune_in_string :: proc "contextless" (s: string) -> (rune, int) {…}

encode_rune #

Source

@(require_results)

encode_rune :: proc "contextless" (c: rune) -> ([4]u8, int) {…}

full_rune_in_bytes #

Source

@(require_results)

full_rune_in_bytes :: proc "contextless" (b: []u8) -> bool {…}

full_rune_in_bytes reports whether the bytes in b begin with a full UTF-8 encoding of a rune. An invalid encoding is considered a full rune, since it will convert as an error rune of width 1 (RUNE_ERROR).

full_rune_in_string #

Source

@(require_results)

full_rune_in_string :: proc "contextless" (s: string) -> bool {…}

full_rune_in_string reports whether the bytes in s begin with a full UTF-8 encoding of a rune. An invalid encoding is considered a full rune, since it will convert as an error rune of width 1 (RUNE_ERROR).

grapheme_count #

Source

@(require_results)

grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {…}

Count the individual graphemes in a UTF-8 string.

Inputs:
- str: The input string.

Returns:
- graphemes: The number of graphemes in the string.
- runes: The number of runes in the string.
- width: The width of the string in number of monospace cells.

rune_at #

Source

@(require_results)

rune_at :: proc "contextless" (s: string, byte_index: int) -> rune {…}

rune_at_pos #

Source

@(require_results)

rune_at_pos :: proc "contextless" (s: string, pos: int) -> rune {…}

rune_count_in_bytes #

Source

@(require_results)

rune_count_in_bytes :: proc "contextless" (s: []u8) -> int {…}

rune_count_in_string #

Source

@(require_results)

rune_count_in_string :: proc(s: string) -> int {…}

rune_offset #

Source

@(require_results)

rune_offset :: proc "contextless" (s: string, pos: int, start: int = 0) -> int {…}

Returns the byte position of the rune at position pos in s, with an optional start byte position. Returns -1 if it runs past the end of the string.

rune_size #

Source

@(require_results)

rune_size :: proc "contextless" (r: rune) -> int {…}

rune_start #

Source

@(require_results)

rune_start :: proc "contextless" (b: u8) -> bool {…}

rune_string_at_pos #

Source

@(require_results)

rune_string_at_pos :: proc "contextless" (s: string, pos: int) -> string {…}

runes_to_string #

Source

@(require_results)

runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> (s: string, err: Allocator_Error) #optional_ok {…}

string_to_runes #

Source

@(require_results)

string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune, err: Allocator_Error) #optional_ok {…}

valid_rune #

Source

@(require_results)

valid_rune :: proc "contextless" (r: rune) -> bool {…}

valid_string #

Source

@(require_results)

valid_string :: proc "contextless" (s: string) -> bool {…}

Procedure Groups

decode_last_rune #

Source

decode_last_rune :: proc{
	decode_last_rune_in_string,
	decode_last_rune_in_bytes,
}

decode_rune #

Source

decode_rune :: proc{
	decode_rune_in_string,
	decode_rune_in_bytes,
}

full_rune #

Source

full_rune :: proc{
	full_rune_in_bytes,
	full_rune_in_string,
}

full_rune reports whether the bytes in its argument (a byte slice or a string) begin with a full UTF-8 encoding of a rune. An invalid encoding is considered a full rune, since it will convert as an error rune of width 1 (RUNE_ERROR).

rune_count #

Source

rune_count :: proc{
	rune_count_in_string,
	rune_count_in_bytes,
}

Variables

accept_ranges #

Source

accept_ranges: [5]Accept_Range = [5]Accept_Range{{0x80, 0xbf}, {0xa0, 0xbf}, {0x80, 0x9f}, {0x90, 0xbf}, {0x80, 0x8f}}

accept_sizes #

Source

accept_sizes: [256]u8 = [256]u8{0x00 ..= 0x7f = 0xf0, 0x80 ..= 0xc1 = 0xf1, 0xc2 ..= 0xdf = 0x02, 0xe0 = 0x13, 0xe1 ..= 0xec = 0x03, 0xed = 0x23, 0xee ..= 0xef = 0x03, 0xf0 = 0x34, 0xf1 ..= 0xf3 = 0x04, 0xf4 = 0x44, 0xf5 ..= 0xff = 0xf1}