-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil_text.ml
157 lines (134 loc) · 4.2 KB
/
util_text.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
(*
Text-manipulation utilities
TODO: merge with Util_string.
*)
open Printf
let ws_re = Pcre.regexp "[ \r\t\n]+"
let split_ws s = Pcre.split ~rex:ws_re s
let line_sep = Pcre.regexp "\r*\n"
let split_lines s =
Pcre.split ~rex:line_sep s
let is_whitespace =
let rex = Pcre.regexp "\\A[ \r\t\n]*\\z" in
fun s -> Pcre.pmatch ~rex s
let test_is_whitespace () =
assert (is_whitespace "");
assert (is_whitespace "\n");
assert (is_whitespace " \n ");
assert (is_whitespace " \n\n");
assert (not (is_whitespace "a\n"));
assert (not (is_whitespace "\na\n"));
true
let string_of_string_option =
function None -> "None" | Some s -> Printf.sprintf "%s" s
let id x = x
let string_of_list f l = "[" ^ (String.concat "; " (List.map f l)) ^ "]"
let string_of_pair f g (x,y) = Printf.sprintf "(%s, %s)" (f x) (g y)
let string_of_string_pair (s1,s2) = Printf.sprintf "-H \"%s: %s\"" s1 s2
let string_of_string_pair_list = string_of_list string_of_string_pair
let validate_utf8 path s =
if Utf8val.is_allowed_unicode s then None
else
let msg =
Printf.sprintf "Malformed UTF-8: %S"
(if String.length s <= 100 then s
else String.sub s 0 100 ^ " ...")
in
Some (Ag_util.Validation.error ~msg path)
let default_dots = " ..."
(* FIXME: make sure to not cut in the middle of a UTF-8 character. *)
let elliptify ?(dots = default_dots) n s =
let suflen = String.length dots in
let n = max suflen n in
let len = String.length s in
if len <= n then s
else
String.sub s 0 (n - suflen) ^ dots
(*
Truncate some text to a maximum number of lines and a maximum number
of bytes, by truncating the last line if needed.
*)
let truncate ?(dots = default_dots) ~max_lines ~max_bytes s =
let concat lines = String.concat "\n" (List.rev lines) in
let rec loop out_lines line_count byte_count in_lines =
match in_lines with
| [] ->
concat out_lines
| line :: more_lines ->
if line_count < max_lines then
let len = String.length line in
let n = byte_count + len in
if n > max_bytes then (
let max_line_length = max_bytes - byte_count in
let out_line = elliptify ~dots max_line_length line in
let out_lines = out_line :: out_lines in
concat out_lines
)
else (
let out_lines = line :: out_lines in
loop out_lines (line_count + 1) n more_lines
)
else
concat (dots :: out_lines)
in
let in_lines = split_lines s in
loop [] 0 0 in_lines
let test_truncate () =
let f s = truncate ~max_lines:2 ~max_bytes:8 s in
assert (f "" = "");
assert (f "123456789" = "1234 ...");
assert (f "1\n2\n3" = "1\n2\n ...");
true
(*
Function called by each *_of_string function generated by:
atdgen -j -j-pp Util_text.pp_utf8
It replaces code points in forbidden Unicode areas by a safe character
(even if they are \u escaped)
*)
let pp_utf8 s =
if Utf8val.is_json_compatible s then s
else
match Utf8val.fix_json_compatible s with
Some s -> s
| None ->
failwith (
sprintf "Invalid UTF8-JSON: %S\n" (elliptify 300 s)
)
let looks_binary s =
let len = String.length s in
let control = ref 0 in
for i = 0 to len - 1 do
if Char.code s.[i] < 32 then
incr control
done;
float !control /. float len >= 0.10
let email_address_regex = Pcre.regexp "[^@]+@[^@]+\\.[^@]+"
let looks_like_email_address s = Pcre.pmatch ~rex:email_address_regex s
let string_of_binary ?(maxlen = 60) s =
let len = String.length s in
if len <= maxlen then
sprintf "[binary data; length: %i]: %S" len s
else
sprintf "[binary data; length: %i]: %S..." len (String.sub s 0 maxlen)
let loggable s =
if looks_binary s then
string_of_binary s
else
s
let prettify s =
try "JSON\n" ^ Yojson.Basic.prettify s ^ "\n"
with _ ->
if looks_binary s then
string_of_binary s
else
s
let tests = [
"is_whitespace", test_is_whitespace;
"utf8 (ascii)",
(fun () -> Utf8val.is_utf8 "abc");
"utf8 (byte128)",
(fun () -> not (Utf8val.is_utf8 "\128"));
"utf8 (byte128 in a longer string)",
(fun () -> not (Utf8val.is_utf8 "a\128b"));
"truncate", test_truncate;
]