mirror of
https://github.com/c-cube/linol.git
synced 2025-12-07 03:35:41 -05:00
109 lines
33 KiB
HTML
109 lines
33 KiB
HTML
<!DOCTYPE html>
|
||
<html xmlns="http://www.w3.org/1999/xhtml"><head><title>Uutf (uutf.Uutf)</title><meta charset="utf-8"/><link rel="stylesheet" href="../../_odoc-theme/odoc.css"/><meta name="generator" content="odoc 2.4.2"/><meta name="viewport" content="width=device-width,initial-scale=1.0"/><script src="../../highlight.pack.js"></script><script>hljs.initHighlightingOnLoad();</script></head><body class="odoc"><nav class="odoc-nav"><a href="../index.html">Up</a> – <a href="../index.html">uutf</a> » Uutf</nav><header class="odoc-preamble"><h1>Module <code><span>Uutf</span></code></h1><p>Non-blocking streaming Unicode codec.</p><p><code>Uutf</code> is a non-blocking streaming codec to <a href="#decode">decode</a> and <a href="#encode">encode</a> the <a href="http://www.ietf.org/rfc/rfc3629.txt">UTF-8</a>, <a href="http://www.ietf.org/rfc/rfc2781.txt">UTF-16</a>, UTF-16LE and UTF-16BE encoding schemes. It can efficiently work character by character without blocking on IO. Decoders perform character position tracking and support <a href="#type-nln" title="nln">newline normalization</a>.</p><p>Functions are also provided to <a href="String/index.html" title="String">fold over</a> the characters of UTF encoded OCaml string values and to <a href="Buffer/index.html" title="Buffer">directly encode</a> characters in OCaml <a href="../../ocaml/Stdlib/Buffer/index.html#type-t"><code>Stdlib.Buffer.t</code></a> values. <b>Note</b> that since OCaml 4.14, that functionality can be found in <a href="../../ocaml/Stdlib/String/index.html"><code>Stdlib.String</code></a> and <a href="../../ocaml/Stdlib/Buffer/index.html"><code>Stdlib.Buffer</code></a> and you are encouraged to migrate to it.</p><p>See <a href="#examples">examples</a> of use.</p><p><b>References</b></p><ul><li>The Unicode Consortium. <em><a href="http://www.unicode.org/versions/latest">The Unicode Standard</a></em>. 
(latest version)</li></ul></header><nav class="odoc-toc"><ul><li><a href="#ucharcsts">Special Unicode characters</a></li><li><a href="#schemes">Unicode encoding schemes</a></li><li><a href="#decode">Decode</a></li><li><a href="#encode">Encode</a></li><li><a href="#manual">Manual sources and destinations.</a></li><li><a href="#strbuf">String folders and Buffer encoders</a></li><li><a href="#examples">Examples</a><ul><li><a href="#readlines">Read lines</a></li><li><a href="#recode">Recode</a></li></ul></li></ul></nav><div class="odoc-content"><h2 id="ucharcsts"><a href="#ucharcsts" class="anchor"></a>Special Unicode characters</h2><div class="odoc-spec"><div class="spec value anchored" id="val-u_bom"><a href="#val-u_bom" class="anchor"></a><code><span><span class="keyword">val</span> u_bom : <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span></code></div><div class="spec-doc"><p><code>u_bom</code> is the <a href="http://unicode.org/glossary/#byte_order_mark">byte order mark</a> (BOM) character (<code>U+FEFF</code>). From OCaml 4.06 on, use <code>Uchar.bom</code>.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-u_rep"><a href="#val-u_rep" class="anchor"></a><code><span><span class="keyword">val</span> u_rep : <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span></code></div><div class="spec-doc"><p><code>u_rep</code> is the <a href="http://unicode.org/glossary/#replacement_character">replacement</a> character (<code>U+FFFD</code>). 
From OCaml 4.06 on, use <code>Uchar.rep</code>.</p></div></div><h2 id="schemes"><a href="#schemes" class="anchor"></a>Unicode encoding schemes</h2><div class="odoc-spec"><div class="spec type anchored" id="type-encoding"><a href="#type-encoding" class="anchor"></a><code><span><span class="keyword">type</span> encoding</span><span> = </span><span>[ </span></code><ol><li id="type-encoding.UTF_16" class="def variant constructor anchored"><a href="#type-encoding.UTF_16" class="anchor"></a><code><span>| </span><span>`UTF_16</span></code></li><li id="type-encoding.UTF_16BE" class="def variant constructor anchored"><a href="#type-encoding.UTF_16BE" class="anchor"></a><code><span>| </span><span>`UTF_16BE</span></code></li><li id="type-encoding.UTF_16LE" class="def variant constructor anchored"><a href="#type-encoding.UTF_16LE" class="anchor"></a><code><span>| </span><span>`UTF_16LE</span></code></li><li id="type-encoding.UTF_8" class="def variant constructor anchored"><a href="#type-encoding.UTF_8" class="anchor"></a><code><span>| </span><span>`UTF_8</span></code></li></ol><code><span> ]</span></code></div><div class="spec-doc"><p>The type for Unicode <a href="http://unicode.org/glossary/#character_encoding_scheme">encoding schemes</a>.</p></div></div><div class="odoc-spec"><div class="spec type anchored" id="type-decoder_encoding"><a href="#type-decoder_encoding" class="anchor"></a><code><span><span class="keyword">type</span> decoder_encoding</span><span> = </span><span>[ </span></code><ol><li id="type-decoder_encoding.encoding" class="def variant type anchored"><a href="#type-decoder_encoding.encoding" class="anchor"></a><code><span>| </span><span><a href="#type-encoding">encoding</a></span></code></li><li id="type-decoder_encoding.US_ASCII" class="def variant constructor anchored"><a href="#type-decoder_encoding.US_ASCII" class="anchor"></a><code><span>| </span><span>`US_ASCII</span></code></li><li id="type-decoder_encoding.ISO_8859_1" class="def variant constructor 
anchored"><a href="#type-decoder_encoding.ISO_8859_1" class="anchor"></a><code><span>| </span><span>`ISO_8859_1</span></code></li></ol><code><span> ]</span></code></div><div class="spec-doc"><p>The type for encoding schemes <em>decoded</em> by <code>Uutf</code>. Unicode encoding schemes plus <a href="http://tools.ietf.org/html/rfc20">US-ASCII</a> and <a href="http://www.ecma-international.org/publications/standards/Ecma-094.htm">ISO/IEC 8859-1</a> (latin-1).</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-encoding_of_string"><a href="#val-encoding_of_string" class="anchor"></a><code><span><span class="keyword">val</span> encoding_of_string : <span>string <span class="arrow">-></span></span> <span><a href="#type-decoder_encoding">decoder_encoding</a> option</span></span></code></div><div class="spec-doc"><p><code>encoding_of_string s</code> converts a (case insensitive) <a href="http://www.iana.org/assignments/character-sets">IANA character set name</a> to an encoding.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-encoding_to_string"><a href="#val-encoding_to_string" class="anchor"></a><code><span><span class="keyword">val</span> encoding_to_string : <span><span>[< <a href="#type-decoder_encoding">decoder_encoding</a> ]</span> <span class="arrow">-></span></span> string</span></code></div><div class="spec-doc"><p><code>encoding_to_string e</code> is a <a href="http://www.iana.org/assignments/character-sets">IANA character set name</a> for <code>e</code>.</p></div></div><h2 id="decode"><a href="#decode" class="anchor"></a>Decode</h2><div class="odoc-spec"><div class="spec type anchored" id="type-src"><a href="#type-src" class="anchor"></a><code><span><span class="keyword">type</span> src</span><span> = </span><span>[ </span></code><ol><li id="type-src.Channel" class="def variant constructor anchored"><a href="#type-src.Channel" class="anchor"></a><code><span>| </span><span>`Channel <span 
class="keyword">of</span> <a href="../../ocaml/Stdlib/index.html#type-in_channel">in_channel</a></span></code></li><li id="type-src.String" class="def variant constructor anchored"><a href="#type-src.String" class="anchor"></a><code><span>| </span><span>`String <span class="keyword">of</span> string</span></code></li><li id="type-src.Manual" class="def variant constructor anchored"><a href="#type-src.Manual" class="anchor"></a><code><span>| </span><span>`Manual</span></code></li></ol><code><span> ]</span></code></div><div class="spec-doc"><p>The type for input sources. With a <code>`Manual</code> source the client must provide input with <a href="Manual/index.html#val-src"><code>Manual.src</code></a>.</p></div></div><div class="odoc-spec"><div class="spec type anchored" id="type-nln"><a href="#type-nln" class="anchor"></a><code><span><span class="keyword">type</span> nln</span><span> = </span><span>[ </span></code><ol><li id="type-nln.ASCII" class="def variant constructor anchored"><a href="#type-nln.ASCII" class="anchor"></a><code><span>| </span><span>`ASCII <span class="keyword">of</span> <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span></code></li><li id="type-nln.NLF" class="def variant constructor anchored"><a href="#type-nln.NLF" class="anchor"></a><code><span>| </span><span>`NLF <span class="keyword">of</span> <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span></code></li><li id="type-nln.Readline" class="def variant constructor anchored"><a href="#type-nln.Readline" class="anchor"></a><code><span>| </span><span>`Readline <span class="keyword">of</span> <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span></code></li></ol><code><span> ]</span></code></div><div class="spec-doc"><p>The type for newline normalizations. 
The variant argument is the normalization character.</p><ul><li><code>`ASCII</code>, normalizes CR (<code>U+000D</code>), LF (<code>U+000A</code>) and CRLF (<<code>U+000D</code>, <code>U+000A</code>>).</li><li><code>`NLF</code>, normalizes the Unicode newline function (NLF). This is NEL (<code>U+0085</code>) and the normalizations of <code>`ASCII</code>.</li><li><code>`Readline</code>, normalizes for a Unicode readline function. This is FF (<code>U+000C</code>), LS (<code>U+2028</code>), PS (<code>U+2029</code>), and the normalizations of <code>`NLF</code>.</li></ul><p>Used with an appropriate normalization character the <code>`NLF</code> and <code>`Readline</code> normalizations allow to implement all the different recommendations of Unicode's newline guidelines (section 5.8 in Unicode 9.0.0).</p></div></div><div class="odoc-spec"><div class="spec type anchored" id="type-decoder"><a href="#type-decoder" class="anchor"></a><code><span><span class="keyword">type</span> decoder</span></code></div><div class="spec-doc"><p>The type for decoders.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder"><a href="#val-decoder" class="anchor"></a><code><span><span class="keyword">val</span> decoder :
|
||
<span><span class="optlabel">?nln</span>:<span>[< <a href="#type-nln">nln</a> ]</span> <span class="arrow">-></span></span>
|
||
<span><span class="optlabel">?encoding</span>:<span>[< <a href="#type-decoder_encoding">decoder_encoding</a> ]</span> <span class="arrow">-></span></span>
|
||
<span><span>[< <a href="#type-src">src</a> ]</span> <span class="arrow">-></span></span>
|
||
<a href="#type-decoder">decoder</a></span></code></div><div class="spec-doc"><p><code>decoder nln encoding src</code> is a decoder that inputs from <code>src</code>.</p><p><b>Byte order mark.</b> <a href="http://unicode.org/glossary/#byte_order_mark">Byte order mark</a> (BOM) constraints are application dependent and prone to misunderstandings (see the <a href="http://www.unicode.org/faq/utf_bom.html#BOM">FAQ</a>). Hence, <code>Uutf</code> decoders have a simple rule: an <em>initial BOM is always removed from the input and not counted in character position tracking</em>. The function <a href="#val-decoder_removed_bom"><code>decoder_removed_bom</code></a> does however return <code>true</code> if a BOM was removed so that all the information can be recovered if needed.</p><p>For UTF-16BE and UTF-16LE the above rule is a violation of conformance D96 and D97 of the standard. <code>Uutf</code> favors the idea that if there's a BOM, decoding with <code>`UTF_16</code> or the <code>`UTF_16XX</code> corresponding to the BOM should decode the same character sequence (this is not the case if you stick to the standard). The client can however regain conformance by consulting the result of <a href="#val-decoder_removed_bom"><code>decoder_removed_bom</code></a> and take appropriate action.</p><p><b>Encoding.</b> <code>encoding</code> specifies the decoded encoding scheme. If <code>`UTF_16</code> is used the endianness is determined according to the standard: from a <a href="http://unicode.org/glossary/#byte_order_mark">BOM</a> if there is one, <code>`UTF_16BE</code> otherwise.</p><p>If <code>encoding</code> is unspecified it is guessed. The result of a guess can only be <code>`UTF_8</code>, <code>`UTF_16BE</code> or <code>`UTF_16LE</code>. The heuristic looks at the first three bytes of input (or less if impossible) and takes the <em>first</em> matching byte pattern in the table below.</p><pre>xx = any byte
|
||
.. = any byte or no byte (input too small)
|
||
pp = positive byte
|
||
uu = valid UTF-8 first byte
|
||
|
||
Bytes | Guess | Rationale
|
||
---------+-----------+-----------------------------------------------
|
||
EF BB BF | `UTF_8 | UTF-8 BOM
|
||
FE FF .. | `UTF_16BE | UTF-16BE BOM
|
||
FF FE .. | `UTF_16LE | UTF-16LE BOM
|
||
00 pp .. | `UTF_16BE | ASCII UTF-16BE and U+0000 is often forbidden
|
||
pp 00 .. | `UTF_16LE | ASCII UTF-16LE and U+0000 is often forbidden
|
||
uu .. .. | `UTF_8 | ASCII UTF-8 or valid UTF-8 first byte.
|
||
xx xx .. | `UTF_16BE | Not UTF-8 => UTF-16, no BOM => UTF-16BE
|
||
.. .. .. | `UTF_8 | Single malformed UTF-8 byte or no input.</pre><p>This heuristic is compatible both with BOM based recognition and <a href="http://tools.ietf.org/html/rfc4627#section-3">JSON-like encoding recognition</a> that relies on ASCII being present at the beginning of the stream. Also, <a href="#val-decoder_removed_bom"><code>decoder_removed_bom</code></a> will tell the client if the guess was BOM based.</p><p><b>Newline normalization.</b> If <code>nln</code> is specified, the given newline normalization is performed, see <a href="#type-nln"><code>nln</code></a>. Otherwise all newlines are returned as found in the input.</p><p><b>Character position.</b> The line number, column number, byte count and character count of the last decoded character (including <code>`Malformed</code> ones) are respectively returned by <a href="#val-decoder_line"><code>decoder_line</code></a>, <a href="#val-decoder_col"><code>decoder_col</code></a>, <a href="#val-decoder_byte_count"><code>decoder_byte_count</code></a> and <a href="#val-decoder_count"><code>decoder_count</code></a>. Before the first call to <a href="#val-decode"><code>decode</code></a> the line number is <code>1</code> and the column is <code>0</code>. Each <a href="#val-decode"><code>decode</code></a> returning <code>`Uchar</code> or <code>`Malformed</code> increments the column until a newline. On a newline, the line number is incremented and the column set to zero. For example the line is <code>2</code> and column <code>0</code> after the first newline was decoded. This can be understood as if <a href="#val-decode"><code>decode</code></a> was moving an insertion point to the right in the data. A <em>newline</em> is anything normalized by <code>`Readline</code>, see <a href="#type-nln"><code>nln</code></a>.</p><p><code>Uutf</code> assumes that each Unicode scalar value has a column width of 1. The same assumption may not be made by the display program (e.g. 
for <code>emacs</code>' compilation mode you need to set <code>compilation-error-screen-columns</code> to <code>nil</code>). The problem is in general difficult to solve without interaction or convention with the display program's rendering engine. Depending on the context better column increments can be implemented by using <code>Uucp.Break.tty_width_hint</code> or <a href="http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries">grapheme cluster boundaries</a> (see <code>Uuseg</code>).</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decode"><a href="#val-decode" class="anchor"></a><code><span><span class="keyword">val</span> decode :
|
||
<span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span>
|
||
<span>[ `Await <span><span>| `Uchar</span> of <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span> <span>| `End</span> <span><span>| `Malformed</span> of string</span> ]</span></span></code></div><div class="spec-doc"><p><code>decode d</code> is:</p><ul><li><code>`Await</code> if <code>d</code> has a <code>`Manual</code> input source and awaits more input. The client must use <a href="Manual/index.html#val-src"><code>Manual.src</code></a> to provide it.</li><li><code>`Uchar u</code> if a Unicode scalar value <code>u</code> was decoded.</li><li><code>`End</code> if the end of input was reached.</li><li><code>`Malformed bytes</code> if the <code>bytes</code> sequence is malformed according to the decoded encoding scheme. If you are interested in a best-effort decoding you can still continue to decode after an error until the decoder synchronizes again on valid bytes. It may however be a good idea to signal the malformed characters by adding an <a href="#val-u_rep"><code>u_rep</code></a> character to the parsed data, see the <a href="#examples">examples</a>.</li></ul><p><b>Note.</b> Repeated invocation always eventually returns <code>`End</code>, even in case of errors.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_encoding"><a href="#val-decoder_encoding" class="anchor"></a><code><span><span class="keyword">val</span> decoder_encoding : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> <a href="#type-decoder_encoding">decoder_encoding</a></span></code></div><div class="spec-doc"><p><code>decoder_encoding d</code> is the decoded encoding scheme of <code>d</code>.</p><p><b>Warning.</b> If the decoder guesses the encoding or uses <code>`UTF_16</code>, rely on this value only after the first <code>`Uchar</code> was decoded.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_line"><a href="#val-decoder_line" 
class="anchor"></a><code><span><span class="keyword">val</span> decoder_line : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> int</span></code></div><div class="spec-doc"><p><code>decoder_line d</code> is the line number of the last decoded (or malformed) character. See <a href="#val-decoder"><code>decoder</code></a> for details.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_col"><a href="#val-decoder_col" class="anchor"></a><code><span><span class="keyword">val</span> decoder_col : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> int</span></code></div><div class="spec-doc"><p><code>decoder_col d</code> is the column number of the last decoded (or malformed) character. See <a href="#val-decoder"><code>decoder</code></a> for details.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_byte_count"><a href="#val-decoder_byte_count" class="anchor"></a><code><span><span class="keyword">val</span> decoder_byte_count : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> int</span></code></div><div class="spec-doc"><p><code>decoder_byte_count d</code> is the number of bytes already decoded on <code>d</code> (including malformed ones). This is the last <a href="#val-decode"><code>decode</code></a>'s end byte offset counting from the beginning of the stream.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_count"><a href="#val-decoder_count" class="anchor"></a><code><span><span class="keyword">val</span> decoder_count : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> int</span></code></div><div class="spec-doc"><p><code>decoder_count d</code> is the number of characters already decoded on <code>d</code> (including malformed ones). 
See <a href="#val-decoder"><code>decoder</code></a> for details.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_removed_bom"><a href="#val-decoder_removed_bom" class="anchor"></a><code><span><span class="keyword">val</span> decoder_removed_bom : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> bool</span></code></div><div class="spec-doc"><p><code>decoder_removed_bom d</code> is <code>true</code> iff an <em>initial</em> <a href="http://unicode.org/glossary/#byte_order_mark">BOM</a> was removed from the input stream. See <a href="#val-decoder"><code>decoder</code></a> for details.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_src"><a href="#val-decoder_src" class="anchor"></a><code><span><span class="keyword">val</span> decoder_src : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> <a href="#type-src">src</a></span></code></div><div class="spec-doc"><p><code>decoder_src d</code> is <code>d</code>'s input source.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-decoder_nln"><a href="#val-decoder_nln" class="anchor"></a><code><span><span class="keyword">val</span> decoder_nln : <span><a href="#type-decoder">decoder</a> <span class="arrow">-></span></span> <span><a href="#type-nln">nln</a> option</span></span></code></div><div class="spec-doc"><p><code>decoder_nln d</code> returns <code>d</code>'s newline normalization (if any).</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-pp_decode"><a href="#val-pp_decode" class="anchor"></a><code><span><span class="keyword">val</span> pp_decode :
|
||
<span><a href="../../ocaml/Stdlib/Format/index.html#type-formatter">Stdlib.Format.formatter</a> <span class="arrow">-></span></span>
|
||
<span><span>[< `Await <span><span>| `Uchar</span> of <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span> <span>| `End</span> <span><span>| `Malformed</span> of string</span> ]</span> <span class="arrow">-></span></span>
|
||
unit</span></code></div><div class="spec-doc"><p><code>pp_decode ppf v</code> prints an unspecified representation of <code>v</code> on <code>ppf</code>.</p></div></div><h2 id="encode"><a href="#encode" class="anchor"></a>Encode</h2><div class="odoc-spec"><div class="spec type anchored" id="type-dst"><a href="#type-dst" class="anchor"></a><code><span><span class="keyword">type</span> dst</span><span> = </span><span>[ </span></code><ol><li id="type-dst.Channel" class="def variant constructor anchored"><a href="#type-dst.Channel" class="anchor"></a><code><span>| </span><span>`Channel <span class="keyword">of</span> <a href="../../ocaml/Stdlib/index.html#type-out_channel">out_channel</a></span></code></li><li id="type-dst.Buffer" class="def variant constructor anchored"><a href="#type-dst.Buffer" class="anchor"></a><code><span>| </span><span>`Buffer <span class="keyword">of</span> <a href="../../ocaml/Stdlib/Buffer/index.html#type-t">Stdlib.Buffer.t</a></span></code></li><li id="type-dst.Manual" class="def variant constructor anchored"><a href="#type-dst.Manual" class="anchor"></a><code><span>| </span><span>`Manual</span></code></li></ol><code><span> ]</span></code></div><div class="spec-doc"><p>The type for output destinations. 
With a <code>`Manual</code> destination the client must provide output storage with <a href="Manual/index.html#val-dst"><code>Manual.dst</code></a>.</p></div></div><div class="odoc-spec"><div class="spec type anchored" id="type-encoder"><a href="#type-encoder" class="anchor"></a><code><span><span class="keyword">type</span> encoder</span></code></div><div class="spec-doc"><p>The type for Unicode encoders.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-encoder"><a href="#val-encoder" class="anchor"></a><code><span><span class="keyword">val</span> encoder : <span><span>[< <a href="#type-encoding">encoding</a> ]</span> <span class="arrow">-></span></span> <span><span>[< <a href="#type-dst">dst</a> ]</span> <span class="arrow">-></span></span> <a href="#type-encoder">encoder</a></span></code></div><div class="spec-doc"><p><code>encoder encoding dst</code> is an encoder for <code>encoding</code> that outputs to <code>dst</code>.</p><p><b>Note.</b> No initial <a href="http://unicode.org/glossary/#byte_order_mark">BOM</a> is encoded. If needed, this duty is left to the client.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-encode"><a href="#val-encode" class="anchor"></a><code><span><span class="keyword">val</span> encode :
|
||
<span><a href="#type-encoder">encoder</a> <span class="arrow">-></span></span>
|
||
<span><span>[< `Await <span>| `End</span> <span><span>| `Uchar</span> of <a href="../../ocaml/Stdlib/Uchar/index.html#type-t">Stdlib.Uchar.t</a></span> ]</span> <span class="arrow">-></span></span>
|
||
<span>[ `Ok <span>| `Partial</span> ]</span></span></code></div><div class="spec-doc"><p><code>encode e v</code> is :</p><ul><li><code>`Partial</code> iff <code>e</code> has a <code>`Manual</code> destination and needs more output storage. The client must use <a href="Manual/index.html#val-dst"><code>Manual.dst</code></a> to provide a new buffer and then call <a href="#val-encode"><code>encode</code></a> with <code>`Await</code> until <code>`Ok</code> is returned.</li><li><code>`Ok</code> when the encoder is ready to encode a new <code>`Uchar</code> or <code>`End</code></li></ul><p>For <code>`Manual</code> destination, encoding <code>`End</code> always returns <code>`Partial</code>, the client should continue as usual with <code>`Await</code> until <code>`Ok</code> is returned at which point <a href="Manual/index.html#val-dst_rem"><code>Manual.dst_rem</code></a> <code>e</code> is guaranteed to be the size of the last provided buffer (i.e. nothing was written).</p><p><b>Raises.</b> <code>Invalid_argument</code> if an <code>`Uchar</code> or <code>`End</code> is encoded after a <code>`Partial</code> encode.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-encoder_encoding"><a href="#val-encoder_encoding" class="anchor"></a><code><span><span class="keyword">val</span> encoder_encoding : <span><a href="#type-encoder">encoder</a> <span class="arrow">-></span></span> <a href="#type-encoding">encoding</a></span></code></div><div class="spec-doc"><p><code>encoder_encoding e</code> is <code>e</code>'s encoding.</p></div></div><div class="odoc-spec"><div class="spec value anchored" id="val-encoder_dst"><a href="#val-encoder_dst" class="anchor"></a><code><span><span class="keyword">val</span> encoder_dst : <span><a href="#type-encoder">encoder</a> <span class="arrow">-></span></span> <a href="#type-dst">dst</a></span></code></div><div class="spec-doc"><p><code>encoder_dst e</code> is <code>e</code>'s output destination.</p></div></div><h2 
id="manual"><a href="#manual" class="anchor"></a>Manual sources and destinations.</h2><div class="odoc-spec"><div class="spec module anchored" id="module-Manual"><a href="#module-Manual" class="anchor"></a><code><span><span class="keyword">module</span> <a href="Manual/index.html">Manual</a></span><span> : <span class="keyword">sig</span> ... <span class="keyword">end</span></span></code></div><div class="spec-doc"><p>Manual sources and destinations.</p></div></div><h2 id="strbuf"><a href="#strbuf" class="anchor"></a>String folders and Buffer encoders</h2><div class="odoc-spec"><div class="spec module anchored" id="module-String"><a href="#module-String" class="anchor"></a><code><span><span class="keyword">module</span> <a href="String/index.html">String</a></span><span> : <span class="keyword">sig</span> ... <span class="keyword">end</span></span></code></div><div class="spec-doc"><p>Fold over the characters of UTF encoded OCaml <code>string</code> values.</p></div></div><div class="odoc-spec"><div class="spec module anchored" id="module-Buffer"><a href="#module-Buffer" class="anchor"></a><code><span><span class="keyword">module</span> <a href="Buffer/index.html">Buffer</a></span><span> : <span class="keyword">sig</span> ... <span class="keyword">end</span></span></code></div><div class="spec-doc"><p>UTF encode characters in OCaml <code>Buffer.t</code> values.</p></div></div><h2 id="examples"><a href="#examples" class="anchor"></a>Examples</h2><h3 id="readlines"><a href="#readlines" class="anchor"></a>Read lines</h3><p>The value of <code>lines src</code> is the list of lines in <code>src</code> as UTF-8 encoded OCaml strings. Line breaks are determined according to the recommendation R4 for a <code>readline</code> function in section 5.8 of Unicode 9.0.0. 
If a decoding error occurs we silently replace the malformed sequence by the replacement character <a href="#val-u_rep"><code>u_rep</code></a> and continue.</p><pre class="language-ocaml"><code>let lines ?encoding (src : [`Channel of in_channel | `String of string]) =
|
||
let rec loop d buf acc = match Uutf.decode d with
|
||
| `Uchar u ->
|
||
begin match Uchar.to_int u with
|
||
| 0x000A ->
|
||
let line = Buffer.contents buf in
|
||
Buffer.clear buf; loop d buf (line :: acc)
|
||
| _ ->
|
||
Uutf.Buffer.add_utf_8 buf u; loop d buf acc
|
||
end
|
||
| `End -> List.rev (Buffer.contents buf :: acc)
|
||
| `Malformed _ -> Uutf.Buffer.add_utf_8 buf Uutf.u_rep; loop d buf acc
|
||
| `Await -> assert false
|
||
in
|
||
let nln = `Readline (Uchar.of_int 0x000A) in
|
||
loop (Uutf.decoder ~nln ?encoding src) (Buffer.create 512) []</code></pre><p>Using the <code>`Manual</code> interface, <code>lines_fd</code> does the same but on a Unix file descriptor.</p><pre class="language-ocaml"><code>let lines_fd ?encoding (fd : Unix.file_descr) =
|
||
let rec loop fd s d buf acc = match Uutf.decode d with
|
||
| `Uchar u ->
|
||
begin match Uchar.to_int u with
|
||
| 0x000A ->
|
||
let line = Buffer.contents buf in
|
||
Buffer.clear buf; loop fd s d buf (line :: acc)
|
||
| _ ->
|
||
Uutf.Buffer.add_utf_8 buf u; loop fd s d buf acc
|
||
end
|
||
| `End -> List.rev (Buffer.contents buf :: acc)
|
||
| `Malformed _ -> Uutf.Buffer.add_utf_8 buf Uutf.u_rep; loop fd s d buf acc
|
||
| `Await ->
|
||
let rec unix_read fd s j l = try Unix.read fd s j l with
|
||
| Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l
|
||
in
|
||
let rc = unix_read fd s 0 (Bytes.length s) in
|
||
Uutf.Manual.src d s 0 rc; loop fd s d buf acc
|
||
in
|
||
let s = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in
|
||
let nln = `Readline (Uchar.of_int 0x000A) in
|
||
loop fd s (Uutf.decoder ~nln ?encoding `Manual) (Buffer.create 512) []</code></pre><h3 id="recode"><a href="#recode" class="anchor"></a>Recode</h3><p>The result of <code>recode src out_encoding dst</code> has the characters of <code>src</code> written on <code>dst</code> with encoding <code>out_encoding</code>. If a decoding error occurs we silently replace the malformed sequence by the replacement character <a href="#val-u_rep"><code>u_rep</code></a> and continue. Note that we don't add an initial <a href="http://unicode.org/glossary/#byte_order_mark">BOM</a> to <code>dst</code>, recoding will thus lose the initial BOM <code>src</code> may have. Whether this is a problem or not depends on the context.</p><pre class="language-ocaml"><code>let recode ?nln ?encoding out_encoding
|
||
(src : [`Channel of in_channel | `String of string])
|
||
(dst : [`Channel of out_channel | `Buffer of Buffer.t])
|
||
=
|
||
let rec loop d e = match Uutf.decode d with
|
||
| `Uchar _ as u -> ignore (Uutf.encode e u); loop d e
|
||
| `End -> ignore (Uutf.encode e `End)
|
||
| `Malformed _ -> ignore (Uutf.encode e (`Uchar Uutf.u_rep)); loop d e
|
||
| `Await -> assert false
|
||
in
|
||
let d = Uutf.decoder ?nln ?encoding src in
|
||
let e = Uutf.encoder out_encoding dst in
|
||
loop d e</code></pre><p>Using the <code>`Manual</code> interface, <code>recode_fd</code> does the same but between Unix file descriptors.</p><pre class="language-ocaml"><code>let recode_fd ?nln ?encoding out_encoding
|
||
(fdi : Unix.file_descr)
|
||
(fdo : Unix.file_descr)
|
||
=
|
||
let rec encode fd s e v = match Uutf.encode e v with `Ok -> ()
|
||
| `Partial ->
|
||
let rec unix_write fd s j l =
|
||
let rec write fd s j l = try Unix.single_write fd s j l with
|
||
| Unix.Unix_error (Unix.EINTR, _, _) -> write fd s j l
|
||
in
|
||
let wc = write fd s j l in
|
||
if wc < l then unix_write fd s (j + wc) (l - wc) else ()
|
||
in
|
||
unix_write fd s 0 (Bytes.length s - Uutf.Manual.dst_rem e);
|
||
Uutf.Manual.dst e s 0 (Bytes.length s);
|
||
encode fd s e `Await
|
||
in
|
||
let rec loop fdi fdo ds es d e = match Uutf.decode d with
|
||
| `Uchar _ as u -> encode fdo es e u; loop fdi fdo ds es d e
|
||
| `End -> encode fdo es e `End
|
||
| `Malformed _ -> encode fdo es e (`Uchar Uutf.u_rep); loop fdi fdo ds es d e
|
||
| `Await ->
|
||
let rec unix_read fd s j l = try Unix.read fd s j l with
|
||
| Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l
|
||
in
|
||
let rc = unix_read fdi ds 0 (Bytes.length ds) in
|
||
Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e
|
||
in
|
||
let ds = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in
|
||
let es = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in
|
||
let d = Uutf.decoder ?nln ?encoding `Manual in
|
||
let e = Uutf.encoder out_encoding `Manual in
|
||
Uutf.Manual.dst e es 0 (Bytes.length es);
|
||
loop fdi fdo ds es d e</code></pre></div></body></html>
|