[go: up one dir, main page]

File: utftrip.ml

package info (click to toggle)
uutf 1.0.4-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 220 kB
  • sloc: ml: 1,584; makefile: 29; sh: 16
file content (391 lines) | stat: -rw-r--r-- 14,553 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
(*---------------------------------------------------------------------------
   Copyright (c) 2012 The uutf programmers. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

let str = Printf.sprintf
let pp = Format.fprintf
let pp_pos ppf d = pp ppf "%d.%d:(%d,%06X) "
  (Uutf.decoder_line d) (Uutf.decoder_col d) (Uutf.decoder_count d)
  (Uutf.decoder_byte_count d)

let pp_decode inf d ppf v =
  pp ppf "@[<h>%s:%a%a@]@\n" inf pp_pos d Uutf.pp_decode v

let exec = Filename.basename Sys.executable_name
let log f = Format.eprintf ("%s: " ^^ f ^^ "@?") exec

let input_malformed = ref false
let log_malformed inf d v =
  input_malformed := true; log "%a" (pp_decode inf d) v

(* IO tools  *)

let io_buffer_size = 65536                          (* IO_BUFFER_SIZE 4.0.0 *)
let unix_buffer_size = 65536                      (* UNIX_BUFFER_SIZE 4.0.0 *)

let rec unix_read fd s j l = try Unix.read fd s j l with
| Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l

let rec unix_write fd s j l =
  let rec write fd s j l = try Unix.single_write fd s j l with
  | Unix.Unix_error (Unix.EINTR, _, _) -> write fd s j l
  in
  let wc = write fd s j l in
  if wc < l then unix_write fd s (j + wc) (l - wc) else ()

let string_of_channel use_unix ic =
  let b = Buffer.create unix_buffer_size in
  let input, s =
    if use_unix
    then unix_read (Unix.descr_of_in_channel ic), Bytes.create unix_buffer_size
    else input ic, Bytes.create io_buffer_size
  in
  let rec loop b input s =
    let rc = input s 0 (Bytes.length s) in
    if rc = 0 then Buffer.contents b else
    (Buffer.add_substring b (Bytes.unsafe_to_string s) 0 rc; loop b input s)
  in
  loop b input s

let string_to_channel use_unix oc s =
  if not use_unix then output_string oc s else
  let s = Bytes.unsafe_of_string s in
  unix_write (Unix.descr_of_out_channel oc) s 0 (Bytes.length s)

let dst_for sout = if sout then `Buffer (Buffer.create 512) else `Channel stdout
let src_for inf sin use_unix =
  try
    let ic = if inf = "-" then stdin else open_in inf in
    if sin then `String (string_of_channel use_unix ic) else `Channel ic
  with Sys_error e -> log "%s\n" e; exit 1

let close_src src =
  try match src with `Channel ic when ic <> stdin -> close_in ic | _ -> () with
  | Sys_error e -> log "%s\n" e; exit 1

let src_for_unix inf =
  try if inf = "-" then Unix.stdin else Unix.(openfile inf [O_RDONLY] 0) with
  | Unix.Unix_error (e, _, v) -> log "%s: %s\n" (Unix.error_message e) v; exit 1

let close_src_unix fd = try if fd <> Unix.stdin then Unix.close fd with
| Unix.Unix_error (e, _, v) -> log "%s: %s\n" (Unix.error_message e) v; exit 1

let rec encode_unix fd s e v = match Uutf.encode e v with `Ok -> ()
| `Partial ->
    unix_write fd s 0 (Bytes.length s - Uutf.Manual.dst_rem e);
    Uutf.Manual.dst e s 0 (Bytes.length s);
    encode_unix fd s e `Await

(* Dump *)

let dump_decode inf d v =
  (match v with `Malformed _ -> input_malformed := true | _ -> ());
  (pp_decode inf d) Format.std_formatter v

let dump_ inf encoding nln src =
  let rec loop inf d = match Uutf.decode d with `Await -> assert false
  | v ->
      dump_decode inf d v;
      if v <> `End then loop inf d
  in
  loop inf (Uutf.decoder ?nln ?encoding src)

let dump_unix inf encoding nln usize fd =
  let rec loop fd s d = match Uutf.decode d with
  | `Await ->
      let rc = unix_read fd s 0 (Bytes.length s) in
      Uutf.Manual.src d s 0 rc; loop fd s d
  | v -> dump_decode inf d v; if v <> `End then loop fd s d
  in
  loop fd (Bytes.create usize) (Uutf.decoder ?nln ?encoding `Manual)

let dump inf sin use_unix usize ie nln =
  if sin || not use_unix then dump_ inf ie nln (src_for inf sin use_unix) else
  dump_unix inf ie nln usize (src_for_unix inf)

(* Guess only *)

let guess inf =
  let d = Uutf.decoder (src_for inf false false) in
  ignore (Uutf.decode d);
  Format.printf "%s@." (Uutf.encoding_to_string (Uutf.decoder_encoding d))

(* Decode only *)

let decode_ inf encoding nln src =
  let malformed = log_malformed inf in
  let rec loop d = match Uutf.decode d with `Await -> assert false
  | `Uchar _ -> loop d
  | `End -> ()
  | `Malformed _ as v -> malformed d v; loop d
  in
  loop (Uutf.decoder ?nln ?encoding src); close_src src

let decode_unix inf encoding nln usize fd =
  let malformed = log_malformed inf in
  let rec loop fd s d = match Uutf.decode d with
  | `Uchar _ -> loop fd s d
  | `End -> ()
  | `Malformed _ as v -> malformed d v; loop fd s d
  | `Await ->
      let rc = unix_read fd s 0 (Bytes.length s) in
      Uutf.Manual.src d s 0 rc; loop fd s d
  in
  loop fd (Bytes.create usize) (Uutf.decoder ?nln ?encoding `Manual);
  close_src_unix fd

let decode inf sin use_unix usize ie nln =
  if sin || not use_unix then decode_ inf ie nln (src_for inf sin use_unix) else
  decode_unix inf ie nln usize (src_for_unix inf)

(* Random encode only *)

let u_surrogate_count = 0xDFFF - 0xD800 + 1
let uchar_count = (0x10FFFF + 1) - u_surrogate_count
let r_uchar () =
  let n = Random.int uchar_count in
  Uchar.of_int (if n > 0xD7FF then n + u_surrogate_count else n)

let r_text encoding encode_f rcount =
  encode_f (`Uchar Uutf.u_bom);
  for i = 1 to rcount do encode_f (`Uchar (r_uchar ())) done;
  encode_f `End

let encode_f encoding dst =
  let e = Uutf.encoder encoding dst in
  fun v -> match Uutf.encode e v with `Ok -> () | `Partial -> assert false

let encode_f_unix usize encoding fd =
  let e, s = Uutf.encoder encoding `Manual, Bytes.create usize in
  Uutf.Manual.dst e s 0 (Bytes.length s);
  encode_unix fd s e

let r_encode sout use_unix usize rseed rcount oe =
  let rseed = match rseed with
  | None -> Random.self_init (); Random.int (1 lsl 30 - 1)
  | Some rseed -> rseed
  in
  let dst = dst_for sout in
  let oe = match oe with None -> `UTF_8 | Some enc -> enc in
  let encode_f =
    if sout || not use_unix then encode_f oe dst else
    encode_f_unix usize oe Unix.stdout
  in
  log "Encoding %d random characters with seed %d\n" rcount rseed;
  Random.init rseed; r_text oe encode_f rcount;
  match dst with `Channel _ -> ()
  | `Buffer b -> string_to_channel use_unix stdout (Buffer.contents b)

(* Trip *)

let trip_ inf nln ie oe src dst =
  let malformed d v e =
    log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep))
  in
  let rec loop d e = function `Await -> assert false
  | `Uchar _ as v -> ignore (Uutf.encode e v); loop d e (Uutf.decode d)
  | `End -> ignore (Uutf.encode e `End)
  | `Malformed _ as v -> malformed d v e; loop d e (Uutf.decode d)
  in
  let d = Uutf.decoder ?nln ?encoding:ie src in
  let e, first = match oe with
  | Some enc -> Uutf.encoder enc dst, (Uutf.decode d)
  | None ->
      let v = Uutf.decode d in                          (* get the encoding. *)
      let enc = match Uutf.decoder_encoding d with
      | #Uutf.encoding as enc -> enc | `ISO_8859_1 | `US_ASCII -> `UTF_8
      in
      Uutf.encoder enc dst, v
  in
  if (Uutf.encoder_encoding e = `UTF_16 || Uutf.decoder_removed_bom d)
  then ignore (Uutf.encode e (`Uchar Uutf.u_bom));
  loop d e first; close_src src

let trip_unix inf usize nln ie oe fdi fdo =
  let malformed  d v e =
    log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep))
  in
  let rec loop fdi fdo ds es d e = function
  | `Uchar _ as v ->
      encode_unix fdo es e v; loop fdi fdo ds es d e (Uutf.decode d)
  | `End -> encode_unix fdo es e `End
  | `Malformed _ as v -> malformed d v e; loop fdi fdo ds es d e (Uutf.decode d)
  | `Await ->
      let rc = unix_read fdi ds 0 (Bytes.length ds) in
      Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e (Uutf.decode d)
  in
  let d, ds = Uutf.decoder ?nln ?encoding:ie `Manual, Bytes.create usize in
  let e, es, first = match oe with
  | Some enc -> Uutf.encoder enc `Manual, Bytes.create usize, (Uutf.decode d)
  | None ->
      let rec decode_past_await d = match Uutf.decode d with
      | `Await ->
          let rc = unix_read fdi ds 0 (Bytes.length ds) in
          Uutf.Manual.src d ds 0 rc; decode_past_await d
      | v -> v
      in
      let v = decode_past_await d in                        (* get encoding. *)
      let enc = match Uutf.decoder_encoding d with
      | #Uutf.encoding as enc -> enc | `ISO_8859_1 | `US_ASCII -> `UTF_8
      in
      Uutf.encoder enc `Manual, Bytes.create usize, v
  in
  Uutf.Manual.dst e es 0 (Bytes.length es);
  if (Uutf.encoder_encoding e = `UTF_16 || Uutf.decoder_removed_bom d)
  then encode_unix fdo es e (`Uchar Uutf.u_bom);
  loop fdi fdo ds es d e first; close_src_unix fdi

let trip inf sin sout use_unix usize ie oe nln =
  let src = src_for inf sin use_unix in
  let dst = dst_for sout in
  if sin || sout || not use_unix then trip_ inf nln ie oe src dst else
  trip_unix inf usize nln ie oe (src_for_unix inf) Unix.stdout;
  match dst with `Channel _ -> ()
  | `Buffer b -> string_to_channel use_unix stdout (Buffer.contents b)

(* Cmd *)

let do_cmd cmd inf sin sout use_unix usize ie oe nln rseed rcount =
  match cmd with
  | `Ascii -> dump inf sin use_unix usize ie nln
  | `Guess -> guess inf
  | `Decode -> decode inf sin use_unix usize ie nln
  | `Encode -> r_encode sout use_unix usize rseed rcount oe
  | `Trip -> trip inf sin sout use_unix usize ie oe nln

(* Cmdline interface *)

open Cmdliner

let enc_enum =
  [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE;
    "UTF-16BE", `UTF_16BE; ]

let decode_enc_enum =
  ("ASCII", `US_ASCII) :: ("latin1", `ISO_8859_1) :: enc_enum

let ienc =
  let doc = str "Decoded (input) encoding, must %s. If unspecified the
                 encoding is guessed."
                (Arg.doc_alts_enum decode_enc_enum)
  in
  Arg.(value & opt (some (enum decode_enc_enum)) None &
       info ["d"; "input-encoding"] ~doc)

let oenc =
  let doc = str "Encoded (output) encoding, must %s. If unspecified the output
                 encoding is the same as the input encoding except for ASCII
                 and latin1 where UTF-8 is output." (Arg.doc_alts_enum enc_enum)
  in
  Arg.(value & opt (some (enum enc_enum)) None &
       info ["e"; "output-encoding"] ~doc)

let nln =
  let lf = Uchar.of_int 0x000A in
  let nln_enum = ["ascii", `ASCII lf; "nlf", `NLF lf; "readline", `Readline lf]
  in
  let doc = str "New line normalization to U+000A, must %s. ascii
                 normalizes CR (U+000D) and CRLF (<U+000D, U+000A>). nlf
                 normalizes like ascii plus NEL (U+0085). readline
                 normalizes like nlf plus FF (U+000C), LS (U+2028) and
                 PS (U+2029)."
      (Arg.doc_alts_enum nln_enum)
  in
  let vopt = Some (`Readline lf) in
  Arg.(value & opt ~vopt (some (enum nln_enum)) None & info ["nln"] ~doc)

let sin =
  let doc = "Input everything in a string and decode the string." in
  Arg.(value & flag & info [ "input-string" ] ~doc)

let sout =
  let doc = "Encode everything in a string and output the string." in
  Arg.(value & flag & info [ "output-string" ] ~doc)

let use_unix =
  let doc = "Use Unix IO." in
  Arg.(value & flag & info [ "use-unix" ] ~doc)

let usize =
  let doc = "Unix IO buffer sizes in bytes." in
  Arg.(value & opt int unix_buffer_size & info ["unix-size"] ~doc)

let nat =
  let parse s =
    try
      let v = int_of_string s in
      if v > 0 then Ok v else failwith (str "%s must be > 0" s)
    with Failure e -> Error e
  in
  Arg.conv' ~docv:"NAT" (parse, Format.pp_print_int)

let rseed =
  let doc = "Random seed." in
  Arg.(value & opt (some nat) None & info ["rseed"] ~doc)

let rcount =
  let doc = "Number of random characters to generate." in
  Arg.(value & opt nat 1_000_000 & info ["rcount"] ~doc)

let file =
  let doc = "The input file. Reads from stdin if unspecified." in
  Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE")

let cmd =
  let doc = "Output the input text as Unicode scalar values or malformed
             sequences, one per line, in the US-ASCII charset with their
             position (see POSITION INFORMATION for more details)."
  in
  let ascii = `Ascii, Arg.info ["a"; "ascii"] ~doc in
  let doc = "Only guess an UTF encoding. The result of a guess can only be
             UTF-8 or UTF-16{LE,BE}."
  in
  let guess = `Guess, Arg.info ["g"; "guess"] ~doc in
  let doc = "Decode only, no encoding." in
  let dec = `Decode, Arg.info ["decode"] ~doc in
  let doc = "Encode only (random), no decoding. See option $(b,--rcount)." in
  let enc = `Encode, Arg.info ["encode"] ~doc in
  Arg.(value & vflag `Trip [ascii; guess; dec; enc])

let cmd =
  let doc = "Recode UTF-{8,16,16LE,16BE} and latin1 from stdin to stdout." in
  let man = [
    `S "DESCRIPTION";
    `P "$(tname) inputs Unicode text from stdin and rewrites it
        to stdout in various ways. If no input encoding is specified,
        it is guessed. If no output encoding is specified, the input
        encoding is used.";
    `P "Invalid byte sequences in the input are reported on stderr and
        replaced by the Unicode replacement character (U+FFFD) in the output.";
    `S "POSITION INFORMATION";
    `P "The format for position information is:";
    `P "filename:line.col:(count,byte)";
    `I ("line", "one-based line number that increments with each newline.
        A newline is always determined as being anything that would be
        normalized by the option `$(b,--nln)=readline`.");
    `I ("col", "zero-based column number that increment with each new
        decoded character and zeroes after a newline
        is decoded. Note that the column number may not correspond to
        user-perceived columns, as any Unicode scalar value, including
        combining characters are deemed to have a width of 1.");
    `I ("count", "the one-based Unicode scalar value count.");
    `I ("byte", "the zero-based end byte offset of the scalar value
                 in the input stream in hexadecimal.");
    `S "EXIT STATUS";
    `P "$(tname) exits with one of the following values:";
    `I ("0", "no error occured");
    `I ("1", "a command line parsing error occured");
    `I ("2", "the input text was malformed");
    `S "BUGS";
    `P "This program is distributed with the Uutf OCaml library.
        See http://erratique.ch/software/uutf for contact
        information."; ]
  in
  Cmd.v (Cmd.info "utftrip" ~version:"v1.0.4" ~doc ~man)
    Term.(const do_cmd $ cmd $ file $ sin $ sout $ use_unix $ usize $
          ienc $ oenc $ nln $ rseed $ rcount)

let () = match Cmd.eval_value cmd with
| Error _ -> exit 1
| _ -> if !input_malformed then exit 2 else exit 0