Source: lib/util/string_utils.js

  1. /*! @license
  2. * Shaka Player
  3. * Copyright 2016 Google LLC
  4. * SPDX-License-Identifier: Apache-2.0
  5. */
  6. goog.provide('shaka.util.StringUtils');
  7. goog.require('goog.asserts');
  8. goog.require('shaka.device.DeviceFactory');
  9. goog.require('shaka.log');
  10. goog.require('shaka.util.BufferUtils');
  11. goog.require('shaka.util.Error');
  12. goog.require('shaka.util.Lazy');
  /**
   * @namespace shaka.util.StringUtils
   * @summary A set of string utility functions.
   * @export
   */
  shaka.util.StringUtils = class {
    /**
     * Creates a string from the given buffer as UTF-8 encoding.
     *
     * A leading UTF-8 byte-order-mark (EF BB BF) is stripped.  Malformed
     * input never throws; bytes that cannot be decoded are replaced with
     * U+FFFD (the "replacement character").
     *
     * @param {?BufferSource} data
     * @return {string} The decoded text, or '' if |data| is null.
     * @export
     */
    static fromUTF8(data) {
      if (!data) {
        return '';
      }

      let uint8 = shaka.util.BufferUtils.toUint8(data);
      // If present, strip off the UTF-8 BOM.
      if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
        uint8 = uint8.subarray(3);
      }

      if (window.TextDecoder && !shaka.device.DeviceFactory.getDevice()
          .shouldAvoidUseTextDecoderEncoder()) {
        // Use the TextDecoder interface to decode the text.  This has the
        // advantage compared to the previously-standard decodeURIComponent
        // that it will continue parsing even if it finds an invalid UTF-8
        // character, rather than stop and throw an error.
        const utf8decoder = new TextDecoder();
        const decoded = utf8decoder.decode(uint8);
        if (decoded.includes('\uFFFD')) {
          shaka.log.alwaysError('Decoded string contains an "unknown character' +
                                '" codepoint.  That probably means the UTF8 ' +
                                'encoding was incorrect!');
        }
        return decoded;
      } else {
        // Homebrewed UTF-8 decoder based on
        // https://en.wikipedia.org/wiki/UTF-8#Encoding
        // Unlike decodeURIComponent, won't throw on bad encoding.
        // In this way, it is similar to TextDecoder.
        let decoded = '';
        for (let i = 0; i < uint8.length; ++i) {
          // By default, the "replacement character" codepoint.
          let codePoint = 0xFFFD;

          // Top bit is 0, 1-byte encoding.
          if ((uint8[i] & 0x80) == 0) {
            codePoint = uint8[i];

          // Top 3 bits of byte 0 are 110, top 2 bits of byte 1 are 10,
          // 2-byte encoding.
          } else if (uint8.length >= i + 2 &&
                     (uint8[i] & 0xe0) == 0xc0 &&
                     (uint8[i + 1] & 0xc0) == 0x80) {
            codePoint = ((uint8[i] & 0x1f) << 6) |
                        ((uint8[i + 1] & 0x3f));
            i += 1;  // Consume one extra byte.

          // Top 4 bits of byte 0 are 1110, top 2 bits of byte 1 and 2 are 10,
          // 3-byte encoding.
          } else if (uint8.length >= i + 3 &&
                     (uint8[i] & 0xf0) == 0xe0 &&
                     (uint8[i + 1] & 0xc0) == 0x80 &&
                     (uint8[i + 2] & 0xc0) == 0x80) {
            codePoint = ((uint8[i] & 0x0f) << 12) |
                        ((uint8[i + 1] & 0x3f) << 6) |
                        ((uint8[i + 2] & 0x3f));
            i += 2;  // Consume two extra bytes.

          // Top 5 bits of byte 0 are 11110, top 2 bits of byte 1, 2 and 3 are
          // 10, 4-byte encoding.  The mask must be 0xf8 (the top five bits);
          // a mask of 0xf1 would also test the lowest bit, rejecting the
          // valid lead bytes 0xf1 and 0xf3 and accepting 0xf8-0xfe.
          } else if (uint8.length >= i + 4 &&
                     (uint8[i] & 0xf8) == 0xf0 &&
                     (uint8[i + 1] & 0xc0) == 0x80 &&
                     (uint8[i + 2] & 0xc0) == 0x80 &&
                     (uint8[i + 3] & 0xc0) == 0x80) {
            codePoint = ((uint8[i] & 0x07) << 18) |
                        ((uint8[i + 1] & 0x3f) << 12) |
                        ((uint8[i + 2] & 0x3f) << 6) |
                        ((uint8[i + 3] & 0x3f));
            i += 3;  // Consume three extra bytes.
          }

          // A 4-byte sequence can carry up to 21 bits, which is more than the
          // largest Unicode code point.  Such values cannot be expressed as a
          // UTF-16 surrogate pair, so treat them as undecodable.
          if (codePoint > 0x10ffff) {
            codePoint = 0xFFFD;
          }

          // JavaScript strings are a series of UTF-16 characters.
          if (codePoint <= 0xffff) {
            decoded += String.fromCharCode(codePoint);
          } else {
            // UTF-16 surrogate-pair encoding, based on
            // https://en.wikipedia.org/wiki/UTF-16#Description
            const baseCodePoint = codePoint - 0x10000;
            const highPart = baseCodePoint >> 10;
            const lowPart = baseCodePoint & 0x3ff;
            decoded += String.fromCharCode(0xd800 + highPart);
            decoded += String.fromCharCode(0xdc00 + lowPart);
          }
        }

        return decoded;
      }
    }

    /**
     * Creates a string from the given buffer as UTF-16 encoding.
     *
     * @param {?BufferSource} data
     * @param {boolean} littleEndian
     *   true to read little endian, false to read big.
     * @param {boolean=} noThrow true to avoid throwing in cases where we may
     *   expect invalid input.  If noThrow is true and the data has an odd
     *   length, the trailing byte is ignored.
     * @return {string} The decoded text, or '' if |data| is null.
     * @throws {shaka.util.Error} BAD_ENCODING when the byte length is odd and
     *   noThrow is not set.
     * @export
     */
    static fromUTF16(data, littleEndian, noThrow) {
      if (!data) {
        return '';
      }

      if (!noThrow && data.byteLength % 2 != 0) {
        shaka.log.error('Data has an incorrect length, must be even.');
        throw new shaka.util.Error(
            shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
            shaka.util.Error.Code.BAD_ENCODING);
      }

      // Use a DataView to ensure correct endianness.
      // Flooring drops a dangling odd byte in the noThrow case.
      const length = Math.floor(data.byteLength / 2);
      const arr = new Uint16Array(length);
      const dataView = shaka.util.BufferUtils.toDataView(data);
      for (let i = 0; i < length; i++) {
        arr[i] = dataView.getUint16(i * 2, littleEndian);
      }
      return shaka.util.StringUtils.fromCharCode(arr);
    }

    /**
     * Creates a string from the given buffer, auto-detecting the encoding that
     * is being used.  If it cannot detect the encoding, it will throw an
     * exception.
     *
     * Detection order: a UTF-8 or UTF-16 byte-order-mark first; otherwise a
     * guess based on where zero bytes appear in the first four bytes; finally
     * UTF-8 if the first four bytes all look like ASCII.
     *
     * @param {?BufferSource} data
     * @return {string} The decoded text, or '' if |data| is null.
     * @throws {shaka.util.Error} UNABLE_TO_DETECT_ENCODING when no rule
     *   matches.
     * @export
     */
    static fromBytesAutoDetect(data) {
      const StringUtils = shaka.util.StringUtils;
      if (!data) {
        return '';
      }

      const uint8 = shaka.util.BufferUtils.toUint8(data);
      if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
        // UTF-8 BOM; fromUTF8 strips it.
        return StringUtils.fromUTF8(uint8);
      } else if (uint8[0] == 0xfe && uint8[1] == 0xff) {
        // UTF-16 big-endian BOM; skip the two BOM bytes.
        return StringUtils.fromUTF16(
            uint8.subarray(2), /* littleEndian= */ false);
      } else if (uint8[0] == 0xff && uint8[1] == 0xfe) {
        // UTF-16 little-endian BOM; skip the two BOM bytes.
        return StringUtils.fromUTF16(
            uint8.subarray(2), /* littleEndian= */ true);
      }

      const isAscii = (i) => {
        // Bytes past the end of the data count as ASCII, so short inputs can
        // still be detected.  Otherwise the byte must lie between horizontal
        // tab (0x09) and '~' (0x7e).
        return uint8.byteLength <= i || (uint8[i] >= 0x09 && uint8[i] <= 0x7e);
      };

      shaka.log.debug(
          'Unable to find byte-order-mark, making an educated guess.');
      if (uint8[0] == 0 && uint8[2] == 0) {
        // Zero in the high byte of the first two code units: big endian.
        return StringUtils.fromUTF16(data, /* littleEndian= */ false);
      } else if (uint8[1] == 0 && uint8[3] == 0) {
        // Zero in the second byte of the first two code units: little endian.
        return StringUtils.fromUTF16(data, /* littleEndian= */ true);
      } else if (isAscii(0) && isAscii(1) && isAscii(2) && isAscii(3)) {
        return StringUtils.fromUTF8(data);
      }

      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.UNABLE_TO_DETECT_ENCODING);
    }

    /**
     * Creates a ArrayBuffer from the given string, converting to UTF-8
     * encoding.
     *
     * @param {string} str
     * @return {!ArrayBuffer}
     * @export
     */
    static toUTF8(str) {
      if (window.TextEncoder && !shaka.device.DeviceFactory.getDevice()
          .shouldAvoidUseTextDecoderEncoder()) {
        const utf8Encoder = new TextEncoder();
        return shaka.util.BufferUtils.toArrayBuffer(utf8Encoder.encode(str));
      } else {
        // http://stackoverflow.com/a/13691499
        // Converts the given string to a URI encoded string.  Unreserved
        // ASCII characters are left alone; every other character is converted
        // to a series of URI escape sequences according to UTF-8.
        // Example: 'g#€' -> 'g%23%E2%82%AC'
        const encoded = encodeURIComponent(str);
        // Convert each escape sequence individually into a character.  Each
        // escape sequence is interpreted as a code-point, so if an escape
        // sequence happens to be part of a multi-byte sequence, each byte will
        // be converted to a single character.
        // Example: 'g%23%E2%82%AC' -> '\x67\x23\xe2\x82\xac'
        const utf8 = unescape(encoded);

        // Every character now holds exactly one byte value.
        const result = new Uint8Array(utf8.length);
        for (let i = 0; i < utf8.length; i++) {
          result[i] = utf8.charCodeAt(i);
        }
        return shaka.util.BufferUtils.toArrayBuffer(result);
      }
    }

    /**
     * Creates a ArrayBuffer from the given string, converting to UTF-16
     * encoding.  Each UTF-16 code unit of the string is written as two bytes;
     * no byte-order-mark is added.
     *
     * @param {string} str
     * @param {boolean} littleEndian
     * @return {!ArrayBuffer}
     * @export
     */
    static toUTF16(str, littleEndian) {
      const result = new ArrayBuffer(str.length * 2);
      const view = new DataView(result);
      for (let i = 0; i < str.length; ++i) {
        const value = str.charCodeAt(i);
        view.setUint16(/* position= */ i * 2, value, littleEndian);
      }
      return result;
    }

    /**
     * Creates a new string from the given array of char codes.
     *
     * Using String.fromCharCode.apply is risky because you can trigger stack
     * errors on very large arrays.  This breaks up the array into several
     * pieces to avoid this.
     *
     * @param {!TypedArray} array
     * @return {string}
     */
    static fromCharCode(array) {
      return shaka.util.StringUtils.fromCharCodeImpl_.value()(array);
    }

    /**
     * Resets the fromCharCode method's implementation.
     * For debug use.
     * @export
     */
    static resetFromCharCode() {
      shaka.util.StringUtils.fromCharCodeImpl_.reset();
    }

    /**
     * This method converts the HTML entities &amp;, &lt;, &gt;, &quot;,
     * &apos;, &nbsp;, &lrm; and &rlm;, as well as numeric character references
     * (decimal such as &#39; or hexadecimal such as &#x27;), in string to
     * their corresponding characters.  Numeric references outside the range
     * 0..0x10FFFF, or that cannot be parsed, are removed.
     *
     * @param {string} input
     * @return {string}
     */
    static htmlUnescape(input) {
      // Used to map HTML entities to characters.
      const htmlUnescapes = {
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': '\'',
        '&nbsp;': '\u{a0}',
        '&lrm;': '\u{200e}',
        '&rlm;': '\u{200f}',
      };

      // Used to match HTML entities and HTML characters.
      const reEscapedHtml =
          /&(?:amp|lt|gt|quot|apos|nbsp|lrm|rlm|#[xX]?[0-9a-fA-F]+);/g;
      // A non-global copy, so that test() below keeps no lastIndex state.
      const reHasEscapedHtml = RegExp(reEscapedHtml.source);

      // This check is an optimization, since replace always makes a copy
      if (input && reHasEscapedHtml.test(input)) {
        return input.replace(reEscapedHtml, (entity) => {
          if (entity[1] == '#') {
            // Translate this into an HTML character.
            let code = 0;
            if (entity[2] == 'x' || entity[2] == 'X') {
              // It's hex.
              code = parseInt(entity.substring(3), 16);
            } else {
              // It's decimal.
              code = parseInt(entity.substring(2), 10);
            }

            // Ignore it if it's an invalid code point.  NaN (e.g. from
            // "&#abc;") fails both comparisons and is dropped as well.
            if (code >= 0 && code <= 0x10FFFF) {
              return String.fromCodePoint(code);
            } else {
              return '';
            }
          }

          // Every named entity accepted by the regex has an entry in the
          // dictionary above, so the single-quote fallback is purely
          // defensive.
          return htmlUnescapes[entity] || '\'';
        });
      }
      return input || '';
    }
  };
  /**
   * Lazily-built implementation behind shaka.util.StringUtils.fromCharCode.
   * On first use it probes for the largest chunk size that
   * String.fromCharCode.apply accepts, then returns a converter that
   * processes its input in chunks of that size.
   *
   * @private {!shaka.util.Lazy.<function(!TypedArray):string>}
   */
  shaka.util.StringUtils.fromCharCodeImpl_ = new shaka.util.Lazy(() => {
    /**
     * Checks whether String.fromCharCode.apply can take |size| arguments.
     *
     * @param {number} size
     * @return {boolean}
     */
    const supportsChunkSize = (size) => {
      try {
        // The compiler will complain about suspicious value if this isn't
        // stored in a variable and used.
        const buffer = new Uint8Array(size);

        // This can't use the spread operator, or it blows up on Xbox One.
        // So we use apply() instead, which is normally not allowed.
        // See issue #2186 for more details.
        const foo = String.fromCharCode.apply(null, buffer);
        goog.asserts.assert(foo, 'Should get value');
        return foo.length > 0;  // Actually use "foo", so it's not compiled out.
      } catch (error) {
        // Any failure is taken to mean the chunk size is not supported.
        return false;
      }
    };

    // Different browsers support different chunk sizes; find out the largest
    // this browser supports so we can use larger chunks on supported browsers
    // but still support lower-end devices that require small chunks.
    // 64k is supported on all major desktop browsers.
    // Halve with integer math and stop after trying 1: fractional sizes are
    // meaningless, and plain division would keep probing 0.5, 0.25, ... for
    // about a thousand iterations before finally reaching 0.
    for (let size = 64 * 1024; size >= 1; size = Math.floor(size / 2)) {
      if (supportsChunkSize(size)) {
        return (buffer) => {
          let ret = '';
          for (let i = 0; i < buffer.length; i += size) {
            const subArray = buffer.subarray(i, i + size);

            // This can't use the spread operator, or it blows up on Xbox One.
            // So we use apply() instead, which is normally not allowed.
            // See issue #2186 for more details.
            ret += String.fromCharCode.apply(null, subArray);  // Issue #2186
          }
          return ret;
        };
      }
    }

    goog.asserts.assert(false, 'Unable to create a fromCharCode method');
    return null;
  });