1 | // Copyright (c) Microsoft. All rights reserved.
|
---|
2 | // Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
---|
3 |
|
---|
4 | #ifdef __cplusplus
|
---|
5 | #include <cstdlib>
|
---|
6 | #include <cstddef>
|
---|
7 | #include <cstdint>
|
---|
8 | #else
|
---|
9 | #include <stdlib.h>
|
---|
10 | #include <stdbool.h>
|
---|
11 | #include <stddef.h>
|
---|
12 | #include <stdint.h>
|
---|
13 | #endif
|
---|
14 |
|
---|
15 | #include "azure_c_shared_utility/utf8_checker.h"
|
---|
16 |
|
---|
/* Returns true when byte has the 10xxxxxx bit pattern of a UTF-8 continuation byte. */
static bool is_utf8_continuation_byte(unsigned char byte)
{
    return ((byte >> 6) == 0x02);
}

/*
 * Checks whether the first length bytes at utf8_str form well-formed UTF-8.
 *
 * Parameters:
 *   utf8_str - bytes to validate; NULL is rejected.
 *   length   - number of bytes to examine; 0 is treated as valid.
 *
 * Returns:
 *   true  - every byte belongs to a complete 1 to 4 byte sequence that encodes
 *           a Unicode scalar value in its shortest form.
 *   false - utf8_str is NULL, or the buffer contains an invalid lead byte, a
 *           missing/invalid continuation byte, a truncated sequence, an
 *           overlong encoding, a UTF-16 surrogate (U+D800..U+DFFF) or a value
 *           above U+10FFFF.
 */
bool utf8_checker_is_valid_utf8(const unsigned char* utf8_str, size_t length)
{
    bool result;

    if (utf8_str == NULL)
    {
        /* Codes_SRS_UTF8_CHECKER_01_002: [ If utf8_checker_is_valid_utf8 is called with NULL utf8_str it shall return false. ]*/
        result = false;
    }
    else
    {
        size_t pos = 0;

        /* Codes_SRS_UTF8_CHECKER_01_003: [ If length is 0, utf8_checker_is_valid_utf8 shall consider utf8_str to be valid UTF-8 and return true. ]*/
        result = true;

        /* Codes_SRS_UTF8_CHECKER_01_001: [ utf8_checker_is_valid_utf8 shall verify that the sequence of chars pointed to by utf8_str represent UTF-8 encoded codepoints. ]*/
        while ((result == true) &&
            (pos < length))
        {
            const unsigned char lead = utf8_str[pos];
            size_t continuation_count = 0;
            uint32_t code_point = 0;
            /* Smallest value that legitimately needs this sequence length; anything lower is an overlong encoding. */
            uint32_t min_code_point = 0;

            if ((lead >> 3) == 0x1E)
            {
                /* 4 bytes */
                /* Codes_SRS_UTF8_CHECKER_01_009: [ 000uuuuu zzzzyyyy yyxxxxxx 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]*/
                continuation_count = 3;
                code_point = (uint32_t)(lead & 0x07);
                min_code_point = 0x10000;
            }
            else if ((lead >> 4) == 0x0E)
            {
                /* 3 bytes */
                /* Codes_SRS_UTF8_CHECKER_01_008: [ zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx ]*/
                continuation_count = 2;
                code_point = (uint32_t)(lead & 0x0F);
                min_code_point = 0x800;
            }
            else if ((lead >> 5) == 0x06)
            {
                /* 2 bytes */
                /* Codes_SRS_UTF8_CHECKER_01_007: [ 00000yyy yyxxxxxx 110yyyyy 10xxxxxx ]*/
                continuation_count = 1;
                code_point = (uint32_t)(lead & 0x1F);
                min_code_point = 0x80;
            }
            else if ((lead >> 7) == 0x00)
            {
                /* 1 byte */
                /* Codes_SRS_UTF8_CHECKER_01_006: [ 00000000 0xxxxxxx 0xxxxxxx ]*/
                continuation_count = 0;
                code_point = (uint32_t)lead;
                min_code_point = 0;
            }
            else
            {
                /* error: stray continuation byte (10xxxxxx) or a lead byte of the form 11111xxx */
                result = false;
            }

            if (result == true)
            {
                size_t i;

                /* step past the lead byte */
                pos++;

                for (i = 0; (i < continuation_count) && (result == true); i++)
                {
                    /* the bounds check comes first so a truncated sequence never reads past the buffer */
                    if ((pos < length) &&
                        is_utf8_continuation_byte(utf8_str[pos]))
                    {
                        code_point = (code_point << 6) | (uint32_t)(utf8_str[pos] & 0x3F);
                        pos++;
                    }
                    else
                    {
                        result = false;
                    }
                }

                if (result == true)
                {
                    if (code_point < min_code_point)
                    {
                        /* overlong encoding: the value fits in a shorter sequence */
                        result = false;
                    }
                    else if (code_point > 0x10FFFF)
                    {
                        /* beyond the last Unicode code point */
                        result = false;
                    }
                    else if ((code_point >= 0xD800) && (code_point <= 0xDFFF))
                    {
                        /* UTF-16 surrogates are not scalar values and must not appear in UTF-8 */
                        result = false;
                    }
                    else
                    {
                        /* Codes_SRS_UTF8_CHECKER_01_005: [ On success it shall return true. ]*/
                        result = true;
                    }
                }
            }
        }
    }

    return result;
}