-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutf16.m
1283 lines (1150 loc) · 66.4 KB
/
utf16.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@import Cocoa ;
@import LuaSkin ;
#import "text.h"
/// === hs.text.utf16 ===
///
/// Perform text manipulation on UTF16 objects created by the `hs.text` module.
///
/// This submodule replicates many of the functions found in the lua `string` and `utf8` libraries but modified for use with UTF16 text objects.
///
/// Metamethods to make the objects work more like Lua strings:
///
/// * unlike most userdata objects used by Hammerspoon modules, `hs.text.utf16` objects have their `__tostring` metamethod defined to return the UTF8 equivalent of the object. This allows the object to be printed to the Hammerspoon console directly with the lua `print` command (e.g. `print(object)`). You can also save the object as a lua string with `tostring(object)`.
/// * (in)equality -- the metamethods for equality and inequality use [hs.text.utf16:compare({"literal"})](#compare) when you use `==`, `~=`, `<`, `<=`, `>`, or `>=` to compare a `hs.text.utf16` to another or to a lua string.
/// * concatenation -- you can create a new `hs.text.utf16` object by combining two objects (or one and a lua string) with `..`
///
/// Additional Notes
///
/// Internally, the macOS provides a wide range of functions for manipulating and managing UTF16 strings in the Objective-C runtime. While a wide variety of encodings can be used for importing and exporting data (see the main body of the `hs.text` module), string manipulation is provided by macOS only for the UTF16 representation of the encoded data. When working with data encoded in other formats, use the `hs.text:toUTF16()` method which will create an object this submodule can manipulate. When finished, you can convert the data back to the necessary encoding with the `hs.text.new()` function and then export the data back (e.g. writing to a file or posting to a URL).
///
/// In addition to the lua `string` and `utf8` functions, additional functions provided by the macOS are included. This includes, but is not limited to, Unicode normalization and ICU transforms.
static LSRefTable refTable = LUA_NOREF ;
#define get_objectFromUserdata(objType, L, idx, tag) (objType*)*((void**)luaL_checkudata(L, idx, tag))
#pragma mark - Support Functions and Classes
// Objective-C object that carries the NSString behind an `hs.text.utf16` value.
@implementation HSTextUTF16Object

// Creates a wrapper around `string`.
//
// Parameters:
//  * string - the text to wrap; may be an NSMutableString (the constructors in this file that
//             assemble text piece by piece pass one).
//
// Returns:
//  * the initialized wrapper, or nil if the superclass initializer fails.
- (instancetype)initWithString:(NSString *)string {
    self = [super init] ;
    if (self) {
        // Store an immutable snapshot so that later changes to a mutable string handed in by the
        // caller can never alter this object's text. For an already immutable NSString, -copy is
        // just a retain, so this costs nothing in the common case.
        _utf16string  = [string copy] ;
        // NOTE(review): presumably counts the Lua userdata references to this object -- confirm
        // against the push/gc helpers, which are outside this section of the file.
        _selfRefCount = 0 ;
    }
    return self ;
}

@end
// Reports whether the UTF16 unit at `idx` lies inside a character instead of starting one.
//
// Parameters:
//  * string             - the text to inspect
//  * idx                - zero-based index of the UTF16 unit to test
//  * charactersComposed - when YES, an index that is not the first unit of its composed character
//                         sequence also counts as being "in the middle"
//
// Returns:
//  * YES if the unit at `idx` is a low surrogate, or (only when `charactersComposed` is YES) if
//    `idx` is not the first index of its composed character sequence; NO otherwise, including
//    when `idx` is at or beyond the end of the string.
BOOL inMiddleOfChar(NSString *string, NSUInteger idx, BOOL charactersComposed) {
    // Nothing at or past the end can be inside a character. Testing with >= (rather than ==)
    // also prevents -characterAtIndex: from raising NSRangeException for an index past the end.
    if (idx >= string.length) return NO ;

    // A low surrogate is the second half of a surrogate pair.
    if (CFStringIsSurrogateLowCharacter([string characterAtIndex:idx])) return YES ;

    if (!charactersComposed) return NO ;

    // In the middle of a composed sequence when the sequence containing idx starts earlier.
    NSRange sequence = [string rangeOfComposedCharacterSequenceAtIndex:idx] ;
    return (idx != sequence.location) ;
}
#pragma mark - Module Functions
/// hs.text.utf16.new(text, [lossy]) -> utf16TextObject
/// Constructor
/// Create a new utf16TextObject from a lua string or `hs.text` object
///
/// Parameters:
/// * `text` - a lua string or `hs.text` object specifying the text for the new utf16TextObject
/// * `lossy` - an optional boolean, default `false`, specifying whether or not characters can be removed or altered when converting the data to the UTF16 encoding.
///
/// Returns:
/// * a new utf16TextObject, or nil if the data could not be encoded as a utf16TextObject
static int utf16_new(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;

    BOOL allowLossy = NO ;
    if (lua_gettop(L) > 1) allowLossy = (BOOL)(lua_toboolean(L, 2)) ;

    // raw bytes of the source text and the encoding they are expressed in
    NSData           *rawBytes       = [NSData data] ;
    NSStringEncoding sourceEncoding  = NSUTF8StringEncoding ;

    switch (lua_type(L, 1)) {
        case LUA_TLIGHTUSERDATA: {
            // internal entry point: another module hands us an NSString pointer directly so the
            // text never has to pass through a lua string
            [skin checkArgs:LS_TANY, LS_TBOOLEAN | LS_TOPTIONAL, LS_TBREAK] ;
            NSString *borrowed = [NSString stringWithString:(__bridge NSString *)(lua_touserdata(L, 1))] ;
            rawBytes       = [borrowed dataUsingEncoding:NSUnicodeStringEncoding allowLossyConversion:NO] ;
            sourceEncoding = NSUnicodeStringEncoding ;
        } break ;

        case LUA_TUSERDATA: {
            [skin checkArgs:LS_TUSERDATA, USERDATA_TAG, LS_TBOOLEAN | LS_TOPTIONAL, LS_TBREAK] ;
            HSTextObject *textObject = [skin toNSObjectAtIndex:1] ;
            rawBytes       = textObject.contents ;
            sourceEncoding = textObject.encoding ;
            // raw (encoding 0) data is not supported here; treat it as UTF8, which is what a
            // plain lua string in argument 1 would have been treated as anyway
            if (sourceEncoding == 0) sourceEncoding = NSUTF8StringEncoding ;
        } break ;

        default: {
            [skin checkArgs:LS_TSTRING, LS_TBOOLEAN | LS_TOPTIONAL, LS_TBREAK] ;
            rawBytes = [skin toNSObjectAtIndex:1 withOptions:LS_NSLuaStringAsDataOnly] ;
        } break ;
    }

    NSString *decoded = [[NSString alloc] initWithData:rawBytes encoding:sourceEncoding] ;

    // Unless lossy conversion was requested, require that string -> data -> string reproduces the
    // same text. Comparing the strings (not the bytes) ignores BOMs and other byte-level
    // differences and checks textual equivalence only.
    //
    // TODO: Check to see if we need to worry about unicode normalization here... probably not,
    //       since both strings derive from a single source, so surrogates and order should not
    //       change, but that has not been verified.
    if (decoded && !allowLossy) {
        NSData   *reEncoded = [decoded dataUsingEncoding:sourceEncoding allowLossyConversion:NO] ;
        NSString *reDecoded = reEncoded ? [[NSString alloc] initWithData:reEncoded encoding:sourceEncoding] : nil ;
        if (!reDecoded || ![decoded isEqualToString:reDecoded]) decoded = nil ;
    }

    // a nil object is pushed when the text could not be represented
    HSTextUTF16Object *result = decoded ? [[HSTextUTF16Object alloc] initWithString:decoded] : nil ;
    [skin pushNSObject:result] ;
    return 1 ;
}
// utf8.char (···)
//
// Receives zero or more integers, converts each one to its corresponding UTF-8 byte sequence and returns a string with the concatenation of all these sequences.
/// hs.text.utf16.char(...) -> utf16TextObject
/// Constructor
/// Create a new utf16TextObject from the Unicode Codepoints specified.
///
/// Parameters:
/// * zero or more Unicode Codepoints specified as integers
///
/// Returns:
/// * a new utf16TextObject
///
/// Notes:
/// * Unicode Codepoints are often written as `U+xxxx` where `xxxx` is between 4 and 6 hexadecimal digits. Lua can automatically convert hexadecimal numbers to integers, so replace the `U+` with `0x` when specifying codepoints in this format.
static int utf16_utf8_char(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;

    // nothing is pushed inside the loop, so the argument count is stable
    int argCount = lua_gettop(L) ;
    NSMutableString *newString = [NSMutableString stringWithCapacity:(NSUInteger)argCount] ;

    for (int i = 1 ; i <= argCount ; i++) {
        if (lua_type(L, i) != LUA_TNUMBER) return luaL_argerror(L, i, [[NSString stringWithFormat:@"number expected, got %s", luaL_typename(L, i)] UTF8String]) ;
        if (!lua_isinteger(L, i)) return luaL_argerror(L, i, "number has no integer representation") ;

        // Validate before narrowing: casting a negative value or one above U+10FFFF down to
        // 32/16 bits would silently produce an unrelated character. Lua's own `utf8.char`
        // rejects such values with this same message.
        lua_Integer value = lua_tointeger(L, i) ;
        if (value < 0 || value > 0x10FFFF) return luaL_argerror(L, i, "value out of range") ;

        unichar    units[2] ;
        NSUInteger unitCount ;
        if (CFStringGetSurrogatePairForLongCharacter((UTF32Char)value, units)) {
            unitCount = 2 ;                  // U+10000 and above: high + low surrogate
        } else {
            units[0]  = (unichar)value ;     // fits in a single UTF16 unit (may be a lone surrogate)
            unitCount = 1 ;
        }
        [newString appendString:[NSString stringWithCharacters:units length:unitCount]] ;
    }

    HSTextUTF16Object *object = [[HSTextUTF16Object alloc] initWithString:newString] ;
    [skin pushNSObject:object] ;
    return 1 ;
}
/// hs.text.utf16.isHighSurrogate(unitchar) -> boolean
/// Function
/// Returns whether or not the specified 16-bit UTF16 unit character is a High Surrogate
///
/// Parameters:
/// * `unitchar` - an integer specifying a single UTF16 character
///
/// Returns:
/// * a boolean specifying whether or not the single UTF16 character specified is a High Surrogate (true) or not (false).
///
/// Notes:
/// * UTF16 represents Unicode characters in the range of U+010000 to U+10FFFF as a pair of UTF16 characters known as a surrogate pair. A surrogate pair is made up of a High Surrogate and a Low Surrogate.
/// * A high surrogate is a single UTF16 "character" with an integer representation between 0xD800 and 0xDBFF inclusive
/// * A low surrogate is a single UTF16 "character" with an integer representation between 0xDC00 and 0xDFFF inclusive.
/// * It is an encoding error if a high surrogate is not immediately followed by a low surrogate or for either surrogate type to be found by itself or surrounded by UTF16 characters outside of the surrogate pair ranges. However, most implementations silently ignore this and simply treat unpaired surrogates as unprintable (control characters) or equivalent to the Unicode Replacement character (U+FFFD).
///
/// * See also [hs.text.utf16.isLowSurrogate](#isLowSurrogate)
static int utf16_isHighSurrogate(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TNUMBER | LS_TINTEGER, LS_TBREAK] ;

    lua_Integer value = lua_tointeger(L, 1) ;

    // Only values that fit in one 16-bit UTF16 unit can be surrogates. Checking before the cast
    // keeps e.g. 0x1D800 from being truncated to 0xD800 and wrongly reported as a high surrogate.
    BOOL fitsInUnit = (value >= 0) && (value <= 0xFFFF) ;
    lua_pushboolean(L, fitsInUnit && CFStringIsSurrogateHighCharacter((unichar)value)) ;
    return 1 ;
}
/// hs.text.utf16.isLowSurrogate(unitchar) -> boolean
/// Function
/// Returns whether or not the specified 16-bit UTF16 unit character is a Low Surrogate
///
/// Parameters:
/// * `unitchar` - an integer specifying a single UTF16 character
///
/// Returns:
/// * a boolean specifying whether or not the single UTF16 character specified is a Low Surrogate (true) or not (false).
///
/// Notes:
/// * UTF16 represents Unicode characters in the range of U+010000 to U+10FFFF as a pair of UTF16 characters known as a surrogate pair. A surrogate pair is made up of a High Surrogate and a Low Surrogate.
/// * A high surrogate is a single UTF16 "character" with an integer representation between 0xD800 and 0xDBFF inclusive
/// * A low surrogate is a single UTF16 "character" with an integer representation between 0xDC00 and 0xDFFF inclusive.
/// * It is an encoding error if a high surrogate is not immediately followed by a low surrogate or for either surrogate type to be found by itself or surrounded by UTF16 characters outside of the surrogate pair ranges. However, most implementations silently ignore this and simply treat unpaired surrogates as unprintable (control characters) or equivalent to the Unicode Replacement character (U+FFFD).
///
/// * See also [hs.text.utf16.isHighSurrogate](#isHighSurrogate)
static int utf16_isLowSurrogate(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TNUMBER | LS_TINTEGER, LS_TBREAK] ;

    lua_Integer value = lua_tointeger(L, 1) ;

    // Only values that fit in one 16-bit UTF16 unit can be surrogates. Checking before the cast
    // keeps e.g. 0x1DC00 from being truncated to 0xDC00 and wrongly reported as a low surrogate.
    BOOL fitsInUnit = (value >= 0) && (value <= 0xFFFF) ;
    lua_pushboolean(L, fitsInUnit && CFStringIsSurrogateLowCharacter((unichar)value)) ;
    return 1 ;
}
/// hs.text.utf16.surrogatePairForCodepoint(codepoint) -> integer, integer | nil
/// Function
/// Returns the surrogate pair for the specified Unicode Codepoint
///
/// Parameters:
/// * `codepoint` - an integer specifying the Unicode codepoint
///
/// Returns:
/// * if the codepoint is between U+010000 to U+10FFFF, returns the UTF16 surrogate pair for the character as 2 integers; otherwise returns nil
///
/// Notes:
/// * UTF16 represents Unicode characters in the range of U+010000 to U+10FFFF as a pair of UTF16 characters known as a surrogate pair. A surrogate pair is made up of a High Surrogate and a Low Surrogate.
///
/// * See also [hs.text.utf16.isHighSurrogate](#isHighSurrogate) and [hs.text.utf16.isLowSurrogate](#isLowSurrogate)
static int utf16_surrogatePairForCodepoint(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TNUMBER | LS_TINTEGER, LS_TBREAK] ;

    lua_Integer value = lua_tointeger(L, 1) ;

    // Reject values outside the Unicode range before narrowing to 32 bits; otherwise a value
    // such as 0x100010000 would wrap to U+10000 and yield a surrogate pair it does not have.
    BOOL isCodepoint = (value >= 0) && (value <= 0x10FFFF) ;

    unichar surrogates[2] ;
    if (isCodepoint && CFStringGetSurrogatePairForLongCharacter((UTF32Char)value, surrogates)) {
        lua_pushinteger(L, (lua_Integer)surrogates[0]) ;    // high surrogate
        lua_pushinteger(L, (lua_Integer)surrogates[1]) ;    // low surrogate
        return 2 ;
    }

    // codepoint is not represented by a surrogate pair
    lua_pushnil(L) ;
    return 1 ;
}
/// hs.text.utf16.codepointForSurrogatePair(high, low) -> integer | nil
/// Function
/// Returns the Unicode Codepoint number for the specified high and low surrogate pair
///
/// Parameters:
/// * `high` - an integer specifying the UTF16 "character" specifying the High Surrogate
/// * `low` - an integer specifying the UTF16 "character" specifying the Low Surrogate
///
/// Returns:
/// * if the `high` and `low` values specify a valid UTF16 surrogate pair, returns an integer specifying the codepoint for the pair; otherwise returns nil
///
/// Notes:
/// * UTF16 represents Unicode characters in the range of U+010000 to U+10FFFF as a pair of UTF16 characters known as a surrogate pair. A surrogate pair is made up of a High Surrogate and a Low Surrogate.
///
/// * See also [hs.text.utf16.isHighSurrogate](#isHighSurrogate) and [hs.text.utf16.isLowSurrogate](#isLowSurrogate)
static int utf16_codepointForSurrogatePair(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TNUMBER | LS_TINTEGER, LS_TNUMBER | LS_TINTEGER, LS_TBREAK] ;

    lua_Integer high = lua_tointeger(L, 1) ;
    lua_Integer low  = lua_tointeger(L, 2) ;

    // Range-check before casting to a 16-bit unit so that larger integers whose low 16 bits
    // happen to look like surrogates are not accepted as a valid pair.
    BOOL highIsValid = (high >= 0) && (high <= 0xFFFF) && CFStringIsSurrogateHighCharacter((unichar)high) ;
    BOOL lowIsValid  = (low  >= 0) && (low  <= 0xFFFF) && CFStringIsSurrogateLowCharacter((unichar)low) ;

    if (highIsValid && lowIsValid) {
        UTF32Char codepoint = CFStringGetLongCharacterForSurrogatePair((unichar)high, (unichar)low) ;
        lua_pushinteger(L, (lua_Integer)codepoint) ;
    } else {
        lua_pushnil(L) ;
    }
    return 1 ;
}
#pragma mark - Module Methods
/// hs.text.utf16:copy() -> utf16TextObject
/// Method
/// Create a copy of the utf16TextObject
///
/// Parameters:
/// * None
///
/// Returns:
/// * a copy of the utf16TextObject as a new object
static int utf16_copy(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TBREAK] ;

    // wrap a copy of the source text in a brand new object
    HSTextUTF16Object *source        = [skin toNSObjectAtIndex:1] ;
    NSString          *duplicateText = [source.utf16string copy] ;
    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:duplicateText]] ;
    return 1 ;
}
/// hs.text.utf16:transform(transform, [inverse]) -> utf16TextObject | nil
/// Method
/// Create a new utf16TextObject by applying the specified ICU transform
///
/// Parameters:
/// * `transform` - a string specifying the ICU transform(s) to apply
/// * `inverse` - an optional boolean, default `false`, specifying whether or not to apply the inverse (or reverse) of the specified transformation
///
/// Returns:
/// * a new utf16TextObject containing the transformed data, or nil if the transform (or its inverse) could not be applied or was invalid
///
/// Notes:
/// * some built in transforms are identified in the constant table [hs.text.utf16.builtinTransforms](#builtInTransforms).
/// * transform syntax is beyond the scope of this document; see http://userguide.icu-project.org/transforms/general for more information on creating your own transforms
///
/// * Note that not all transforms have an inverse or are reversible.
static int utf16_transform(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TSTRING, LS_TBOOLEAN | LS_TOPTIONAL, LS_TBREAK] ;

    HSTextUTF16Object *source       = [skin toNSObjectAtIndex:1] ;
    NSString          *sourceText   = source.utf16string ;
    NSString          *icuTransform = [skin toNSObjectAtIndex:2] ;

    BOOL applyInverse = NO ;
    if (lua_gettop(L) == 3) applyInverse = (BOOL)(lua_toboolean(L, 3)) ;

    // the transform is applied in place, so work on a mutable copy covering the whole string
    NSMutableString *working = [sourceText mutableCopy] ;
    NSRange affectedRange ;     // required out-parameter; its value is not used
    BOOL transformed = [working applyTransform:icuTransform
                                       reverse:applyInverse
                                         range:NSMakeRange(0, working.length)
                                  updatedRange:&affectedRange] ;

    if (!transformed) {
        lua_pushnil(L) ;
        return 1 ;
    }

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:working]] ;
    return 1 ;
}
/// hs.text.utf16:unicodeDecomposition([compatibilityMapping]) -> utf16TextObject
/// Method
/// Create a new utf16TextObject with the contents of the parent normalized using Unicode Normalization Form (K)D.
///
/// Parameters:
/// * `compatibilityMapping` - an optional boolean, default `false`, specifying whether compatibility mapping (true) should be used (Normalization Form KD) or canonical mapping (false) should be used (Normalization Form D) when normalizing the text.
///
/// Returns:
/// * a new utf16TextObject with the contents of the parent normalized using Unicode NormalizationForm (K)D.
///
/// Notes:
/// * At its most basic, normalization is useful when comparing strings which may have been composed differently (e.g. a single UTF16 character representing an accented `á` vs the visually equivalent composed character sequence of an `a` followed by U+0301) or use stylized versions of characters or numbers (e.g. `1` vs `①`), but need to be compared for their "visual" or "intended" equivalence.
///
/// * see http://www.unicode.org/reports/tr15/ for a more complete discussion of the various types of Unicode Normalization and the differences/strengths/weaknesses of each.
///
/// * See also [hs.text.utf16:unicodeComposition](#unicodeComposition)
static int utf16_unicodeDecomposition(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TBOOLEAN | LS_TOPTIONAL, LS_TBREAK] ;

    HSTextUTF16Object *source     = [skin toNSObjectAtIndex:1] ;
    NSString          *sourceText = source.utf16string ;

    BOOL useCompatibility = NO ;
    if (lua_gettop(L) > 1) useCompatibility = (BOOL)(lua_toboolean(L, 2)) ;

    // compatibility mapping -> Normalization Form KD, canonical mapping -> Normalization Form D
    NSString *normalized ;
    if (useCompatibility) {
        normalized = sourceText.decomposedStringWithCompatibilityMapping ;
    } else {
        normalized = sourceText.decomposedStringWithCanonicalMapping ;
    }

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:normalized]] ;
    return 1 ;
}
/// hs.text.utf16:unicodeComposition([compatibilityMapping]) -> utf16TextObject
/// Method
/// Create a new utf16TextObject with the contents of the parent normalized using Unicode Normalization Form (K)C.
///
/// Parameters:
/// * `compatibilityMapping` - an optional boolean, default `false`, specifying whether compatibility mapping (true) should be used (Normalization Form KC) or canonical mapping (false) should be used (Normalization Form C) when normalizing the text.
///
/// Returns:
/// * a new utf16TextObject with the contents of the parent normalized using Unicode NormalizationForm (K)C.
///
/// Notes:
/// * At its most basic, normalization is useful when comparing strings which may have been composed differently (e.g. a single UTF16 character representing an accented `á` vs the visually equivalent composed character sequence of an `a` followed by U+0301) or use stylized versions of characters or numbers (e.g. `1` vs `①`), but need to be compared for their "visual" or "intended" equivalence.
///
/// * see http://www.unicode.org/reports/tr15/ for a more complete discussion of the various types of Unicode Normalization and the differences/strengths/weaknesses of each.
///
/// * See also [hs.text.utf16:unicodeDecomposition](#unicodeDecomposition)
static int utf16_unicodeComposition(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TBOOLEAN | LS_TOPTIONAL, LS_TBREAK] ;

    HSTextUTF16Object *source     = [skin toNSObjectAtIndex:1] ;
    NSString          *sourceText = source.utf16string ;

    BOOL useCompatibility = NO ;
    if (lua_gettop(L) > 1) useCompatibility = (BOOL)(lua_toboolean(L, 2)) ;

    // compatibility mapping -> Normalization Form KC, canonical mapping -> Normalization Form C
    NSString *normalized ;
    if (useCompatibility) {
        normalized = sourceText.precomposedStringWithCompatibilityMapping ;
    } else {
        normalized = sourceText.precomposedStringWithCanonicalMapping ;
    }

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:normalized]] ;
    return 1 ;
}
/// hs.text.utf16:unitCharacter([i], [j]) -> integer, ...
/// Method
/// Returns the UTF16 unit character codes for the range specified
///
/// Parameters:
/// * `i` - an optional integer, default 1, specifying the starting index of the UTF16 character to begin at; negative indices are counted from the end of the string.
/// * `j` - an optional integer, default the value of `i`, specifying the end of the range; negative indices are counted from the end of the string.
///
/// Returns:
/// * zero or more integers representing the individual utf16 "characters" of the object within the range specified
///
/// Notes:
/// * this method returns the 16bit integer corresponding to the UTF16 "character" at the indices specified. Surrogate pairs *are* treated as two separate "characters" by this method, so the initial or final character may be a broken surrogate -- see [hs.text.utf16.isHighSurrogate](#isHighSurrogate) and [hs.text.utf16.isLowSurrogate](#isLowSurrogate).
///
/// * this method follows the semantics of `utf8.codepoint` -- if a specified index is out of range, a lua error is generated.
static int utf16_unitCharacter(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;
    HSTextUTF16Object *utf16Object = [skin toNSObjectAtIndex:1] ;
    NSString *objString = utf16Object.utf16string ;

    lua_Integer i      = (lua_gettop(L) > 1) ? lua_tointeger(L, 2) : 1 ;
    lua_Integer j      = (lua_gettop(L) > 2) ? lua_tointeger(L, 3) : i ;
    lua_Integer length = (lua_Integer)objString.length ;

    // adjust indices per lua standards: negative indices count back from the end of the string
    if (i < 0) i = length + 1 + i ;
    if (j < 0) j = length + 1 + j ;

    // match behavior of utf8.codepoint -- it is stricter than string.sub about indices
    if ((i < 1) || (i > length)) return luaL_argerror(L, 2, "out of range") ;
    if ((j < 1) || (j > length)) return luaL_argerror(L, 3, "out of range") ;

    // empty interval; return no values
    if (i > j) return 0 ;

    // One value is pushed per unit, and Lua only guarantees a small number of free stack slots,
    // so reserve the space up front (as utf8.codepoint does) rather than overrunning the stack.
    // The first test also guards the lua_Integer -> int conversion of the result count.
    if (j - i >= INT_MAX) return luaL_error(L, "string slice too long") ;
    int count = (int)(j - i) + 1 ;
    luaL_checkstack(L, count, "string slice too long") ;

    for (lua_Integer idx = i ; idx <= j ; idx++) {
        unichar codeUnit = [objString characterAtIndex:(NSUInteger)(idx - 1)] ;    // lua is 1-based
        lua_pushinteger(L, (lua_Integer)codeUnit) ;
    }
    return count ;
}
/// hs.text.utf16:composedCharacterRange([i], [j]) -> start, end
/// Method
/// Returns the starting and ending index of the specified range, adjusting for composed characters or surrogate pairs at the beginning and end of the range.
///
/// Parameters:
/// * `i` - an optional integer, default 1, specifying the starting index of the UTF16 character to begin at; negative indices are counted from the end of the string.
/// * `j` - an optional integer, default the value of `i`, specifying the end of the range; negative indices are counted from the end of the string.
///
/// Returns:
/// * the `start` and `end` indices for the range of characters specified by the initial range
///
/// Notes:
/// * if the unit character at index `i` specifies a low surrogate or is in the middle of a multi-"character" composed character, `start` will be < `i`
/// * likewise if `j` is in the middle of a multi-"character" composition or surrogate, `end` will be > `j`.
///
/// * this method follows the semantics of `utf8.codepoint` -- if a specified index is out of range, a lua error is generated.
static int utf16_composedCharacterRange(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;
    HSTextUTF16Object *utf16Object = [skin toNSObjectAtIndex:1] ;
    NSString *objString = utf16Object.utf16string ;

    lua_Integer i      = (lua_gettop(L) > 1) ? lua_tointeger(L, 2) : 1 ;
    lua_Integer j      = (lua_gettop(L) > 2) ? lua_tointeger(L, 3) : i ;
    lua_Integer length = (lua_Integer)objString.length ;

    // adjust indices per lua standards: negative indices count back from the end of the string
    if (i < 0) i = length + 1 + i ;
    if (j < 0) j = length + 1 + j ;

    // match behavior of utf8.codepoint -- it is stricter than string.sub about indices
    if ((i < 1) || (i > length)) return luaL_argerror(L, 2, "out of range") ;
    if ((j < 1) || (j > length)) return luaL_argerror(L, 3, "out of range") ;

    // With an inverted range the unsigned length computed below would wrap around to an enormous
    // value and NSString would raise NSRangeException; report it as a lua argument error instead.
    // (j == i - 1 still yields a valid zero-length range and is passed through as before.)
    if (j < i - 1) return luaL_argerror(L, 3, "range end precedes range start") ;

    NSRange targetRange ;
    if (i == j) {
        targetRange = [objString rangeOfComposedCharacterSequenceAtIndex:(NSUInteger)(i - 1)] ;
    } else {
        NSUInteger loc = (NSUInteger)(i - 1) ;          // convert to 0-based location
        NSUInteger len = (NSUInteger)(j - (i - 1)) ;    // inclusive of both i and j
        targetRange = [objString rangeOfComposedCharacterSequencesForRange:NSMakeRange(loc, len)] ;
    }

    // convert back to 1-based, inclusive start and end indices
    lua_pushinteger(L, (lua_Integer)(targetRange.location + 1)) ;
    lua_pushinteger(L, (lua_Integer)(targetRange.location + targetRange.length)) ;
    return 2 ;
}
/// hs.text.utf16:capitalize([locale]) -> utf16TextObject
/// Method
/// Returns a copy of the utf16TextObject with all words capitalized.
///
/// Parameters:
/// * `locale` - an optional string or boolean (default omitted) specifying whether to consider localization when determining how to capitalize words.
/// * if this parameter is omitted, uses canonical (non-localized) mapping suitable for programming operations that require stable results not depending on the current locale.
/// * if this parameter is the boolean `false` or `nil`, uses the system locale
/// * if this parameter is the boolean `true`, uses the users current locale
/// * if this parameter is a string, the locale specified by the string is used. (See `hs.host.locale.availableLocales()` for valid locale identifiers)
///
/// Returns:
/// * a new utf16TextObject containing the capitalized version of the source
///
/// Notes:
/// * For the purposes of this method, a capitalized string is a string with the first character in each word changed to its corresponding uppercase value, and all remaining characters set to their corresponding lowercase values. A word is any sequence of characters delimited by spaces, tabs, or line terminators. Some common word delimiting punctuation isn’t considered, so this property may not generally produce the desired results for multiword strings.
static int utf16_capitalize(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TSTRING | LS_TBOOLEAN | LS_TNIL | LS_TOPTIONAL, LS_TBREAK] ;

    HSTextUTF16Object *source     = [skin toNSObjectAtIndex:1] ;
    NSString          *sourceText = source.utf16string ;

    // a locale identifier is only present when argument 2 is a string
    NSString *localeID = nil ;
    if (lua_type(L, 2) == LUA_TSTRING) localeID = [skin toNSObjectAtIndex:2] ;

    NSString *capitalized = nil ;
    if (lua_gettop(L) != 2) {
        // no locale argument at all: canonical, locale independent mapping
        capitalized = sourceText.capitalizedString ;
    } else {
        NSLocale *chosenLocale = nil ;      // nil selects the system locale (boolean false / nil argument)
        if (localeID) {
            chosenLocale = [NSLocale localeWithLocaleIdentifier:localeID] ;
            if (!chosenLocale) return luaL_argerror(L, 2, "unrecognized locale specified") ;
        } else if (lua_toboolean(L, 2)) {
            chosenLocale = [NSLocale currentLocale] ;
        }
        capitalized = [sourceText capitalizedStringWithLocale:chosenLocale] ;
    }

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:capitalized]] ;
    return 1 ;
}
// documented in `init.lua`
// Compares the receiver's text with another utf16TextObject or with the string form of any
// other lua value.
//
// Lua arguments:
//  1. the utf16TextObject (self)
//  2. the value to compare against: a utf16TextObject, or any value (converted with
//     luaL_tolstring and decoded as UTF8)
//  3. optional - either an integer of NSStringCompareOptions flags, or the locale argument
//  4. optional - the locale argument when argument 3 holds the options
//
// The locale argument follows the convention used by the other methods in this file: omitted
// means no locale is passed to the comparison, `true` selects the current locale, `false`/`nil`
// passes a nil locale, and a string is used as a locale identifier.
//
// Pushes -1, 0, or 1 for ascending, same, or descending order. An unexpected comparison result
// is logged and reported as -999.
static int utf16_compare(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG,
                    LS_TANY,
                    LS_TNUMBER | LS_TINTEGER | LS_TSTRING | LS_TNIL | LS_TOPTIONAL,
                    LS_TSTRING | LS_TBOOLEAN | LS_TNIL | LS_TOPTIONAL, LS_TBREAK] ;
    HSTextUTF16Object *utf16Object = [skin toNSObjectAtIndex:1] ;
    NSString *objString = utf16Object.utf16string ;

    NSString *target = nil ;
    if (lua_type(L, 2) == LUA_TUSERDATA) {
        // a userdata target must be another utf16TextObject; use its text directly instead of
        // first converting the whole object to a UTF8 lua string
        [skin checkArgs:LS_TANY, LS_TUSERDATA, UTF16_UD_TAG, LS_TBREAK | LS_TVARARG] ;
        HSTextUTF16Object *targetObject = [skin toNSObjectAtIndex:2] ;
        target = targetObject.utf16string ;
    } else {
        // Decode with an explicit byte count so embedded NUL bytes do not truncate the text.
        size_t     byteCount = 0 ;
        const char *bytes    = luaL_tolstring(L, 2, &byteCount) ;
        target = [[NSString alloc] initWithBytes:bytes length:byteCount encoding:NSUTF8StringEncoding] ;
        lua_pop(L, 1) ;     // remove the string luaL_tolstring pushed
    }
    // Decoding yields nil for bytes that are not valid UTF8, and comparing against nil is
    // documented by Apple as undefined behavior, so fail explicitly instead.
    if (!target) return luaL_argerror(L, 2, "unable to interpret value as UTF8 text") ;

    NSStringCompareOptions options = 0 ;
    int localeIdx = 3 ;
    if (lua_type(L, 3) == LUA_TNUMBER) {
        // argument 3 holds the compare options, which moves the locale to argument 4
        localeIdx++ ;
        options = (NSStringCompareOptions)(lua_tointeger(L, 3)) ;
    }
    // NOTE(review): when options are omitted a boolean locale would sit at index 3, but the
    // argument check above does not list LS_TBOOLEAN for that position -- confirm against the
    // documentation/wrapper in init.lua.

    BOOL     useLocale = (lua_gettop(L) == localeIdx) ;
    NSString *locale   = (lua_type(L, localeIdx) == LUA_TSTRING) ? [skin toNSObjectAtIndex:localeIdx] : nil ;

    NSRange            compareRange = NSMakeRange(0, objString.length) ;
    NSComparisonResult result ;

    if (useLocale) {
        NSLocale *specifiedLocale = lua_toboolean(L, localeIdx) ? [NSLocale currentLocale] : nil ; // handles boolean/nil
        if (locale) {
            specifiedLocale = [NSLocale localeWithLocaleIdentifier:locale] ;
            if (!specifiedLocale) {
                return luaL_argerror(L, localeIdx, "unrecognized locale specified") ;
            }
        }
        result = [objString compare:target options:options range:compareRange locale:specifiedLocale] ;
    } else {
        result = [objString compare:target options:options range:compareRange] ;
    }

    switch (result) {
        case NSOrderedAscending:  lua_pushinteger(L, -1) ; break ;
        case NSOrderedSame:       lua_pushinteger(L,  0) ; break ;
        case NSOrderedDescending: lua_pushinteger(L,  1) ; break ;
        default:
            // cast so the argument matches the %ld format specifier
            [skin logError:[NSString stringWithFormat:@"%s:compare - unexpected comparison result of %ld when comparing %@ and %@", UTF16_UD_TAG, (long)result, objString, target]] ;
            lua_pushinteger(L, -999) ;
            break ;
    }
    return 1 ;
}
#pragma mark * From lua string library *
/// hs.text.utf16:upper([locale]) -> utf16TextObject
/// Method
/// Returns a copy of the utf16TextObject with an uppercase representation of the source.
///
/// Parameters:
///  * `locale` - an optional string or boolean (default omitted) specifying whether to consider localization when determining how to change case.
///    * if this parameter is omitted, uses canonical (non-localized) mapping suitable for programming operations that require stable results not depending on the current locale.
///    * if this parameter is the boolean `false` or `nil`, uses the system locale
///    * if this parameter is the boolean `true`, uses the user's current locale
///    * if this parameter is a string, the locale specified by the string is used. (See `hs.host.locale.availableLocales()` for valid locale identifiers)
///
/// Returns:
///  * a new utf16TextObject containing an uppercase representation of the source.
///
/// Notes:
///  * This method is the utf16 equivalent of lua's `string.upper`
///  * Case transformations aren’t guaranteed to be symmetrical or to produce strings of the same lengths as the originals.
static int utf16_string_upper(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TSTRING | LS_TBOOLEAN | LS_TNIL | LS_TOPTIONAL, LS_TBREAK] ;

    HSTextUTF16Object *source = [skin toNSObjectAtIndex:1] ;
    NSString *sourceText = source.utf16string ;
    NSString *uppercased = nil ;

    if (lua_gettop(L) != 2) {
        // no locale argument supplied: canonical, locale independent mapping
        uppercased = sourceText.uppercaseString ;
    } else {
        // truthy argument -> current locale ; false or nil -> nil, i.e. the system locale
        NSLocale *chosenLocale = nil ;
        if (lua_toboolean(L, 2)) chosenLocale = [NSLocale currentLocale] ;

        if (lua_type(L, 2) == LUA_TSTRING) {
            NSString *identifier = [skin toNSObjectAtIndex:2] ;
            if (identifier) {
                // an explicit identifier takes precedence over the boolean interpretation
                chosenLocale = [NSLocale localeWithLocaleIdentifier:identifier] ;
                if (!chosenLocale) {
                    return luaL_argerror(L, 2, "unrecognized locale specified") ;
                }
            }
        }
        uppercased = [sourceText uppercaseStringWithLocale:chosenLocale] ;
    }

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:uppercased]] ;
    return 1 ;
}
/// hs.text.utf16:lower([locale]) -> utf16TextObject
/// Method
/// Returns a copy of the utf16TextObject with a lowercase representation of the source.
///
/// Parameters:
///  * `locale` - an optional string or boolean (default omitted) specifying whether to consider localization when determining how to change case.
///    * if this parameter is omitted, uses canonical (non-localized) mapping suitable for programming operations that require stable results not depending on the current locale.
///    * if this parameter is the boolean `false` or `nil`, uses the system locale
///    * if this parameter is the boolean `true`, uses the user's current locale
///    * if this parameter is a string, the locale specified by the string is used. (See `hs.host.locale.availableLocales()` for valid locale identifiers)
///
/// Returns:
///  * a new utf16TextObject containing a lowercase representation of the source.
///
/// Notes:
///  * This method is the utf16 equivalent of lua's `string.lower`
///  * Case transformations aren’t guaranteed to be symmetrical or to produce strings of the same lengths as the originals.
static int utf16_string_lower(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TSTRING | LS_TBOOLEAN | LS_TNIL | LS_TOPTIONAL, LS_TBREAK] ;

    HSTextUTF16Object *source = [skin toNSObjectAtIndex:1] ;
    NSString *sourceText = source.utf16string ;
    NSString *lowercased = nil ;

    BOOL localeArgumentGiven = (lua_gettop(L) == 2) ;
    if (!localeArgumentGiven) {
        // canonical, locale independent mapping
        lowercased = sourceText.lowercaseString ;
    } else {
        NSString *identifier = nil ;
        if (lua_type(L, 2) == LUA_TSTRING) identifier = [skin toNSObjectAtIndex:2] ;

        NSLocale *chosenLocale ;
        if (identifier) {
            // explicit locale identifier
            chosenLocale = [NSLocale localeWithLocaleIdentifier:identifier] ;
            if (!chosenLocale) {
                return luaL_argerror(L, 2, "unrecognized locale specified") ;
            }
        } else if (lua_toboolean(L, 2)) {
            // truthy argument -> the user's current locale
            chosenLocale = [NSLocale currentLocale] ;
        } else {
            // false or nil -> nil, which Foundation treats as the system locale
            chosenLocale = nil ;
        }
        lowercased = [sourceText lowercaseStringWithLocale:chosenLocale] ;
    }

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:lowercased]] ;
    return 1 ;
}
/// hs.text.utf16:len() -> integer
/// Method
/// Returns the length in UTF16 characters in the object
///
/// Parameters:
///  * None
///
/// Returns:
///  * the number of UTF16 characters in the object
///
/// Notes:
///  * This method is the utf16 equivalent of lua's `string.len`
///  * Composed character sequences and surrogate pairs are made up of multiple UTF16 "characters"; see also [hs.text.utf16:characterCount](#characterCount) which offers more options.
static int utf16_string_length(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    // Only the first argument is validated: when invoked as the `__len` metamethod, "self"
    // may be supplied twice, so anything after it is tolerated.
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TBREAK | LS_TVARARG] ;

    HSTextUTF16Object *source = [skin toNSObjectAtIndex:1] ;
    NSUInteger unitCount = source.utf16string.length ; // counted in UTF16 code units
    lua_pushinteger(L, (lua_Integer)unitCount) ;
    return 1 ;
}
/// hs.text.utf16:sub(i, [j]) -> utf16TextObject
/// Method
/// Returns a new utf16TextObject containing a substring of the source object
///
/// Parameters:
///  * `i` - an integer specifying the starting index of the substring; negative indices are counted from the end of the string.
///  * `j` - an optional integer, default -1, specifying the end of the substring; negative indices are counted from the end of the string.
///
/// Returns:
///  * a new utf16TextObject containing a substring of the source object as delimited by the indices `i` and `j`
///
/// Notes:
///  * This method is the utf16 equivalent of lua's `string.sub`
///    * In particular, `hs.text.utf16:sub(1, j)` will return the prefix of the source with a length of `j`, and `hs.text.utf16:sub(-i)` returns the suffix of the source with a length of `i`.
///
///  * This method uses the specific indices provided, which could result in a broken surrogate or composed character sequence at the beginning or end of the substring. If this is a concern, use [hs.text.utf16:composedCharacterRange](#composedCharacterRange) to adjust the range values before invoking this method.
static int utf16_string_sub(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TNUMBER | LS_TINTEGER, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;

    HSTextUTF16Object *source = [skin toNSObjectAtIndex:1] ;
    NSString *sourceText = source.utf16string ;
    lua_Integer total = (lua_Integer)sourceText.length ;

    lua_Integer first = lua_tointeger(L, 2) ;
    lua_Integer last = -1 ;
    if (lua_gettop(L) > 2) last = lua_tointeger(L, 3) ;

    // Lua index rules: negative values count back from the end of the string ...
    if (first < 0) first += total + 1 ;
    if (last < 0) last += total + 1 ;
    // ... then the start is clamped to 1 and the end is clamped to the length
    first = MAX(first, 1) ;
    last = MIN(last, total) ;

    NSString *slice = @"" ;
    if ((first <= total) && (first <= last)) {
        // translate the inclusive 1-based pair into a 0-based location and a length
        NSUInteger location = (NSUInteger)(first - 1) ;
        NSUInteger span = (NSUInteger)(last - first + 1) ;
        slice = [sourceText substringWithRange:NSMakeRange(location, span)] ;
    }

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:slice]] ;
    return 1 ;
}
/// hs.text.utf16:reverse() -> utf16TextObject
/// Method
/// Returns a new utf16TextObject with the characters reversed.
///
/// Parameters:
///  * None
///
/// Returns:
///  * a new utf16TextObject with the characters reversed
///
/// Notes:
///  * This method is the utf16 equivalent of lua's `string.reverse`
///  * Surrogate pairs and composed character sequences are maintained, so the reversed object will be composed of valid UTF16 sequences (assuming, of course, that the original object was composed of valid UTF16 sequences)
static int utf16_string_reverse(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TBREAK] ;

    HSTextUTF16Object *source = [skin toNSObjectAtIndex:1] ;
    NSString *forward = source.utf16string ;

    // Walk the string from its end, one composed character sequence at a time, appending each
    // sequence intact (approach from https://stackoverflow.com/a/6730329).
    NSRange everything = NSMakeRange(0, forward.length) ;
    NSStringEnumerationOptions backwardsByCluster = NSStringEnumerationByComposedCharacterSequences | NSStringEnumerationReverse ;
    NSMutableString *backward = [NSMutableString stringWithCapacity:everything.length] ;

    [forward enumerateSubstringsInRange:everything
                                options:backwardsByCluster
                             usingBlock:^(NSString *cluster, __unused NSRange clusterRange, __unused NSRange enclosingRange, __unused BOOL *stop) {
        [backward appendString:cluster] ;
    }] ;

    [skin pushNSObject:[[HSTextUTF16Object alloc] initWithString:backward]] ;
    return 1 ;
}
#pragma mark * From lua utf8 library *
/// hs.text.utf16:codepoint([i], [j]) -> integer, ...
/// Method
/// Returns the Unicode Codepoints for all characters in the utf16TextObject between the specified indices.
///
/// Parameters:
///  * `i` - an optional integer, default 1, specifying the starting index of the UTF16 character to begin at; negative indices are counted from the end of the string.
///  * `j` - an optional integer, default the value of `i`, specifying the end of the range; negative indices are counted from the end of the string.
///
/// Returns:
///  * zero or more integers representing the Unicode Codepoints of the UTF16 "character" at the indices specified.
///
/// Notes:
///  * This method is the utf16 equivalent of lua's `utf8.codepoint` and follows the same semantics -- if a specified index is out of range, a lua error is generated.
///  * A lua error is also generated if the range contains an invalid UTF16 sequence: a low surrogate that is not preceded by a high surrogate (this includes starting in the middle of a surrogate pair) or a high surrogate that is not followed by a low surrogate.
///  * This method differs from [hs.text.utf16:unitCharacter](#unitCharacter) in that surrogate pairs will result in a single codepoint between U+010000 to U+10FFFF instead of two separate UTF16 characters.
static int utf16_utf8_codepoint(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;
    [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;
    HSTextUTF16Object *utf16Object = [skin toNSObjectAtIndex:1] ;
    NSString *objString = utf16Object.utf16string ;

    lua_Integer i = (lua_gettop(L) > 1) ? lua_tointeger(L, 2) : 1 ;
    lua_Integer j = (lua_gettop(L) > 2) ? lua_tointeger(L, 3) : i ;
    lua_Integer length = (lua_Integer)objString.length ;

    // adjust indices per lua standards: negative indices are counted from the string end
    if (i < 0) i = length + 1 + i ;
    if (j < 0) j = length + 1 + j ;

    // match behavior of utf8.codepoint -- it is stricter than string.sub about indices
    if ((i < 1) || (i > length)) return luaL_argerror(L, 2, "out of range") ;
    if ((j < 1) || (j > length)) return luaL_argerror(L, 3, "out of range") ;

    // Each UTF16 unit in the range yields at most one result, so (j - i + 1) is an upper bound
    // on the number of values pushed; make sure the Lua stack has room for all of them first.
    if (i <= j) {
        if ((j - i) >= INT_MAX) return luaL_error(L, "string slice too long") ;
        luaL_checkstack(L, (int)(j - i) + 1, "string slice too long") ;
    }

    int count = 0 ;
    while (i <= j) {
        unichar ch1 = [objString characterAtIndex:(NSUInteger)(i - 1)] ;
        uint32_t codepoint = ch1 ;

        if (CFStringIsSurrogateLowCharacter(ch1)) {
            // Every valid pair is consumed as a whole below, so a low surrogate seen here is
            // either isolated or the range began in the middle of a surrogate pair.
            return luaL_error(L, "invalid UTF-16 code") ;
        } else if (CFStringIsSurrogateHighCharacter(ch1)) {
            i++ ; // surrogate pair, so get second half (which may lie beyond j)
            if (i > length) {
                // the string ends with a broken surrogate pair
                return luaL_error(L, "invalid UTF-16 code") ;
            }
            unichar ch2 = [objString characterAtIndex:(NSUInteger)(i - 1)] ;
            if (!CFStringIsSurrogateLowCharacter(ch2)) {
                // high surrogate without its low half; combining them would yield garbage
                return luaL_error(L, "invalid UTF-16 code") ;
            }
            codepoint = CFStringGetLongCharacterForSurrogatePair(ch1, ch2) ;
        }

        lua_pushinteger(L, (lua_Integer)codepoint) ;
        i++ ;
        count++ ;
    }
    return count ;
}
/// hs.text.utf16:characterCount([composedCharacters], [i], [j]) -> integer | nil, integer
/// Method
/// Returns the number of UTF16 characters in the utf16TextObject between the specified indices.
///
/// Parameters:
///  * `composedCharacters` - an optional boolean, default `false`, specifying whether or not composed character sequences should be treated as a single character (true) or count for as many individual UTF16 "characters" as are actually used to specify the sequence (false).
///  * `i` - an optional integer, default 1, specifying the starting index of the UTF16 character to begin at; negative indices are counted from the end of the string.
///  * `j` - an optional integer, default -1, specifying the end of the range; negative indices are counted from the end of the string.
///
/// Returns:
///  * if no invalid sequences are found (see next), returns the number of Unicode characters in the range specified.
///  * if an invalid sequence is found (specifically an isolated low or high surrogate, or a composed character sequence that starts or ends outside of the specified range when `composedCharacters` is `true`), returns `nil` and the index position of the first invalid UTF16 character.
///
/// Notes:
///  * This method is similar to lua's `utf8.len` and follows the same semantics -- if a specified index is out of range, a lua error is generated.
///  * This method differs from [hs.text.utf16:len](#len) in that surrogate pairs count as one character and composed characters can optionally be considered a single character as well.
static int utf16_utf8_len(lua_State *L) {
    LuaSkin *skin = [LuaSkin sharedWithState:L] ;

    // A leading boolean selects composed-character counting and shifts `i` and `j` right by one.
    BOOL flagGiven = (lua_type(L, 2) == LUA_TBOOLEAN) ;
    if (flagGiven) {
        [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TBOOLEAN, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;
    } else {
        [skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;
    }
    int iIdx = flagGiven ? 3 : 2 ;
    BOOL charactersComposed = flagGiven ? (BOOL)(lua_toboolean(L, 2)) : NO ;

    HSTextUTF16Object *source = [skin toNSObjectAtIndex:1] ;
    NSString *text = source.utf16string ;
    lua_Integer length = (lua_Integer)text.length ;

    // Index handling mirrors `utf8.len` in lutf8lib.c.
    int top = lua_gettop(L) ;
    lua_Integer pos = 1 ;
    lua_Integer last = -1 ;
    if (top >= iIdx) pos = lua_tointeger(L, iIdx) ;
    if (top > iIdx) last = lua_tointeger(L, iIdx + 1) ;

    // negative positions count from the end; ones reaching before the start collapse to 0
    if (pos < 0) pos = (length + pos < 0) ? 0 : (length + pos + 1) ;
    if (last < 0) last = (length + last < 0) ? 0 : (length + last + 1) ;

    // validate, converting both positions to 0-based offsets as we go
    if ((pos < 1) || (pos - 1 > length)) return luaL_argerror(L, iIdx, "initial position out of string") ;
    pos-- ;
    last-- ;
    if (last >= length) return luaL_argerror(L, iIdx + 1, "final position out of string") ;

    lua_Integer count = 0 ;
    lua_Integer invalidAt = 0 ; // 1-based position to report; 0 while everything is valid

    while (pos <= last) {
        unichar unit = [text characterAtIndex:(NSUInteger)pos] ;

        if (CFStringIsSurrogateLowCharacter(unit)) {
            // lone low surrogate
            invalidAt = pos + 1 ;
            break ;
        }

        if (CFStringIsSurrogateHighCharacter(unit)) {
            BOOL paired = (pos < last) && CFStringIsSurrogateLowCharacter([text characterAtIndex:(NSUInteger)(pos + 1)]) ;
            if (!paired) {
                // not followed by a low surrogate, or the low surrogate lies outside the range
                invalidAt = pos + 1 ;
                break ;
            }
            // NOTE(review): a valid pair is consumed here without consulting the composed
            // character lookup below, so a sequence that begins with a surrogate pair is not
            // treated as one composed character.
            pos += 2 ;
        } else if (!charactersComposed) {
            pos++ ; // valid single unit
        } else {
            NSRange cluster = [text rangeOfComposedCharacterSequenceAtIndex:(NSUInteger)pos] ;
            if (cluster.location != (NSUInteger)pos) {
                // not at the beginning of a composed character sequence
                invalidAt = pos + 1 ;
                break ;
            }
            if ((pos + (lua_Integer)cluster.length - 1) > last) {
                // composed character sequence extends beyond the range
                invalidAt = last + 1 ;
                break ;
            }
            pos += (lua_Integer)cluster.length ; // valid composed character
        }
        count++ ;
    }

    if (invalidAt != 0) {
        lua_pushnil(L) ;
        lua_pushinteger(L, invalidAt) ;
        return 2 ;
    }
    lua_pushinteger(L, count) ;
    return 1 ;
}
/// hs.text.utf16:offset([composedCharacters], n, [i]) -> integer | nil
/// Method
/// Returns the position (in UTF16 characters) where the encoding of the `n`th character of the utf16TextObject begins.
///
/// Parameters:
/// * `composedCharacters` - an optional boolean, default `false`, specifying whether or not composed character sequences should be considered as a single UTF16 character (true) or as the individual characters that make up the sequence (false).
/// * `n` - an integer specifying the UTF16 character number to get the offset for, starting from position `i`. If `n` is negative, specifies the number of characters before position `i`.
/// * `i` - an optional integer, default 1 when `n` is non-negative or [hs.text.utf16:len](#len) + 1 when `n` is negative, specifying the starting character from which to count `n`.
///
/// Returns:
/// * the index of the utf16TextObject where the `n`th character begins or nil if no such character exists. As a special case when `n` is 0, returns the offset of the start of the character that contains the `i`th UTF16 character of the utf16Text object.
///
/// Notes:
/// * This method is the utf16 equivalent of lua's `utf8.offset`.
static int utf16_utf8_offset(lua_State *L) {
LuaSkin *skin = [LuaSkin sharedWithState:L] ;
// index of `n` on the Lua stack; moves to 3 when the optional leading boolean is present
int nIdx = 2 ;
BOOL charactersComposed = NO ;
if (lua_type(L, 2) == LUA_TBOOLEAN) {
[skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TBOOLEAN, LS_TNUMBER | LS_TINTEGER, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;
nIdx++ ;
charactersComposed = (BOOL)(lua_toboolean(L, 2)) ;
} else {
[skin checkArgs:LS_TUSERDATA, UTF16_UD_TAG, LS_TNUMBER | LS_TINTEGER, LS_TNUMBER | LS_TINTEGER | LS_TOPTIONAL, LS_TBREAK] ;
}
// the remainder is adapted from the `utf8.offset` implementation in lutf8lib.c
HSTextUTF16Object *utf16Object = [skin toNSObjectAtIndex:1] ;
NSString *objString = utf16Object.utf16string ;
lua_Integer n = lua_tointeger(L, nIdx) ;
lua_Integer length = (lua_Integer)objString.length ;
// default start: 1 when n is non-negative, one past the last unit when n is negative
lua_Integer i = (lua_gettop(L) > nIdx) ? lua_tointeger(L, nIdx + 1) : ((n > -1) ? 1 : (length + 1)) ;
// negative positions count from the end; ones reaching before the start collapse to 0
if (i < 0) i = (length + i < 0) ? 0 : (length + i + 1) ;
// the pre-decrement converts `i` to a 0-based offset; after this check 0 <= i <= length
luaL_argcheck(L, 1 <= i && --i <= length, nIdx + 1, "position out of range") ;
// NOTE(review): `i` may equal `length` in the `inMiddleOfChar` calls below; that helper is
// defined elsewhere in this file -- confirm it tolerates an index one past the last unit.
if (n == 0) {
// special case: back up to the first unit of the character containing position i
while (i > 0 && inMiddleOfChar(objString, (NSUInteger)i, charactersComposed)) i-- ;
} else {
if (inMiddleOfChar(objString, (NSUInteger)i, charactersComposed)) return luaL_error(L, "initial position is in middle of surrogate pair or composed character sequence") ;
if (n < 0) {
while (n < 0 && i > 0) { // move back
do { // find beginning of previous character
i-- ;
} while (i > 0 && inMiddleOfChar(objString, (NSUInteger)i, charactersComposed)) ;
n++ ;
}
} else {
n-- ; // do not move for 1st character
while (n > 0 && i < length) {
do { // find beginning of next character
i++ ;
} while (inMiddleOfChar(objString, (NSUInteger)i, charactersComposed)) ; // (cannot pass final '\0')
n-- ;
}
}
}
// n reaches 0 only if every requested step was taken before running off either end
if (n == 0) { // did it find given character?
lua_pushinteger(L, i + 1) ; // report as a 1-based index
} else { // no such character
lua_pushnil(L) ;
}
return 1 ;
}