-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathU.cc
462 lines (370 loc) · 12.1 KB
/
U.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
// Name: Maxwell You
// Date: 2017-04-29
// Purpose: Overload operators
#include "U.h"
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
// Default ctor using initializer list
U::U() : charsRead("") { } // initialized charsRead to be empty
// Copy ctor
U::U(const U & rhs) : charsRead(rhs.charsRead), charsReadVect(rhs.charsReadVect) { } // copy all rhs's class members (private vars) to
// Filename ctor
U::U(string filename) {
charsRead = ""; // initialize string to empty string
readfile(filename); // call readfile to do the work
}
// Assignment operator=
const U & U::operator=(const U & rhs) {
if (this != &rhs) { // rhs by itself is the actual object, but with the & (& rhs), it get the pointer to rhs, which is also what this is (a pointer to the object)
charsRead = rhs.charsRead; // assign contents of rhs's charsRead to this object
charsReadVect = rhs.charsReadVect; // assign contents of rhs's charsReadVect to this obj
}
return *this; // this is just a pointer, *this is the actual object
}
// Dtor
U::~U( ) { } // default destructor
// Error check ifstream for failure
void U::streamFail(ifstream & in, int byteNum, string filename) const {
if (in.fail()) {
ostringstream oss;
oss << "Byte " << byteNum << " was unable to be retrieved (doesn't exist) in file: " << filename;
throw oss.str();
}
}
// Error check for invalid continuation byte
void U::contByteFail(int byte) const {
if ( (byte & 0xC0) != 0x80 ) {
ostringstream oss;
oss << "0x" << hex << byte << " was not a valid continuation byte";
throw oss.str();
}
}
// Determines byte length of character
int U::bytes(int byte) const {
if ( (byte & 0x80) == 0 )
return 1;
else if ( (byte & 0xE0) == 0xC0 )
return 2;
else if ( (byte & 0xF0) == 0xE0 )
return 3;
else if ( (byte & 0xF8) == 0xF0 )
return 4;
else { // means byte is not valid UTF8 leading byte
ostringstream oss;
oss << "Not a valid UTF8 character. Bad leading byte: 0x"<< hex << byte;
throw oss.str();
}
}
// Read and check for valid UTF8 characters
void U::readUTF(int byte1, ifstream & in, string filename) {
char fbyte = byte1; // used to concatenate first byte to the charsRead string
char c; // to get the additional UTF bytes
// 1 byte
if (bytes(byte1) == 1)
charsRead += fbyte; // add first byte of char to charsRead
// 2 bytes
else if (bytes(byte1) == 2) {
charsRead += fbyte; // add first byte of char to charsRead
in.get(c);
streamFail(in, 2, filename); // check if the stream failed to get a character
int byte2 = c;
contByteFail(byte2); // check if this is a valid continuation byte
charsRead += c;
}
// 3 bytes
else if (bytes(byte1) == 3) {
charsRead += fbyte;
in.get(c);
streamFail(in, 2, filename);
int byte2 = c;
contByteFail(byte2);
charsRead += c;
in.get(c);
streamFail(in, 3, filename);
int byte3 = c;
contByteFail(byte3);
charsRead += c;
}
// 4 bytes
else if (bytes(byte1) == 4) {
charsRead += fbyte; // add first byte of char to charsRead
in.get(c);
streamFail(in, 2, filename);
int byte2 = c;
contByteFail(byte2);
charsRead += c;
in.get(c);
streamFail(in, 3, filename);
int byte3 = c;
contByteFail(byte3);
charsRead += c;
in.get(c);
streamFail(in, 4, filename);
int byte4 = c;
contByteFail(byte4);
charsRead += c;
}
}
// Create vector to store UTF8 chars from accumulated string
void U::createUTFVect() {
for (uint i = 0; i < charsRead.length(); /* leave incrementing work for body of loop */) {
if (bytes(charsRead.at(i)) == 1) { // evaluate # of bytes a character consists of
charsReadVect.push_back(charsRead.substr(i, 1)); // add the char to the vector (no need to error check since readUTF already did)
++i; // incr once because char was 1 byte
} else if (bytes(charsRead.at(i)) == 2) {
charsReadVect.push_back(charsRead.substr(i, 2));
i += 2;
} else if (bytes(charsRead.at(i)) == 3) {
charsReadVect.push_back(charsRead.substr(i, 3));
i += 3;
} else if (bytes(charsRead.at(i)) == 4) {
charsReadVect.push_back(charsRead.substr(i, 4));
i += 4;
}
}
}
// Read characters from an input file
void U::readfile(string filename) {
ifstream in(filename);
if (!in) {
ostringstream oss;
oss << "Unable to read file: " << filename;
throw oss.str();
}
char c;
while(in.get(c)) {
if (in.fail()) {
ostringstream oss;
oss << "Byte 1 was unable to be retrieved (doesn't exist) in file: " << filename;
throw oss.str();
}
readUTF(c, in, filename);
}
in.close(); // close the file stream
createUTFVect(); // create vector from this accumulated string
}
void U::append(string extra) {
string utfChar; // stores all bytes of UTF8 character as a string
for (uint i = 0; i < extra.length(); /* let loop body incr */) {
/* These conditional will run convUTF based on the # of bytes a character is
* convUTF is passed the current character (which should be the start of a UTF8 charac)
* and it will pass it the rest of the character's bytes too
* The index, i, gets incremented accordingly based on # of bytes
*/
if (bytes(extra.at(i)) == 1) { // evaluate # of bytes a character consists of
utfChar = extra.substr(i, 1); // get 1 character from string
convUTF(extra.at(i), utfChar); // use convUTF to check for invalid UTF8 char
charsRead += utfChar; // add the char to the accumulated string
charsReadVect.push_back(utfChar); // add the char to the vector
++i; // incr once because char was 1 byte
} else if (bytes(extra.at(i)) == 2) {
utfChar = extra.substr(i, 2);
convUTF(extra.at(i), utfChar);
charsRead += utfChar;
charsReadVect.push_back(utfChar);
i += 2;
} else if (bytes(extra.at(i)) == 3) {
utfChar = extra.substr(i, 3);
convUTF(extra.at(i), extra.substr(i, 3));
charsRead += utfChar;
charsReadVect.push_back(utfChar);
i += 3;
} else if (bytes(extra.at(i)) == 4) {
utfChar = extra.substr(i, 4);
convUTF(extra.at(i), extra.substr(i, 4));
charsRead += utfChar;
charsReadVect.push_back(utfChar);
i += 4;
}
// no need to error check since convUTF() and bytes() will take care of that
}
}
// State how many characters read thus far
int U::size() const {
return charsReadVect.size(); // return length of the vector
}
// Get all chars read thus far
string U::get() const {
return charsRead;
}
// Get char at index
string U::get(int index) const {
if (index > (this->size() - 1) || index < 0) {
ostringstream oss;
oss << "Invalid index: " << index << " (valid range: [0," << this->size() << "))";
throw oss.str();
}
return charsReadVect.at(index);
}
// Get chars from start to end
string U::get(int start, int end) const {
if (start < 0 || end > (this->size() - 1)) {
ostringstream oss;
if (start < 0 && end > this->size() - 1)
oss << "Invalid indices: (" << start << ", " << end << ") (valid range: [0," << this->size() << "))";
else if (start < 0)
oss << "Invalid index: " << start << " (valid range: [0," << this->size() << "))";
else
oss << "Invalid index: " << end << " (valid range: [0," << this->size() << "))";
throw oss.str();
}
string res = "";
for (int i = start; i < end; ++i) {
res += charsReadVect.at(i);
}
return res;
}
int U::convUTF(int byte1, string charac) const { // index is passed by reference because it will change the actual value of index
// in the loop this function will be called in
char c; // to get the additional UTF bytes
if ( (byte1 & 0x80) == 0 )
return (byte1 & 0x7F);
if ( (byte1 & 0xE0) == 0xC0 ) {
c = charac.at(1);
int byte2 = c;
contByteFail(byte2);
byte1 = (byte1 << 6) & 0x7C0;
byte2 = byte2 & 0x3F;
return (byte1 | byte2);
}
if ( (byte1 & 0xF0) == 0xE0) {
c = charac.at(1);
int byte2 = c;
contByteFail(byte2);
c = charac.at(2);
int byte3 = c;
contByteFail(byte3);
byte1 = (byte1 << 12) & 0xF000; // shift to bits 15-12, mask upper 4 bits
byte2 = (byte2 << 6) & 0xFC0; // shift to bits 11-6, mask bits 12-7
byte3 = byte3 & 0x3F; // shift to bits 5-0, mask bits 5-0
return (byte1 | byte2 | byte3); // combine the 3 bytes to get unicode
}
if ( (byte1 & 0xF8) == 0xF0 ) {
c = charac.at(1);
int byte2 = c;
contByteFail(byte2);
c = charac.at(2);
int byte3 = c;
contByteFail(byte3);
c = charac.at(3);
int byte4 = c;
contByteFail(byte4);
byte1 = (byte1 & 0x07) << 18; // get lower 3 bits of 1st byte, shift to bits 20-18
byte2 = (byte2 & 0x3F) << 12; // get lower 6 bits of 2nd byte, shift to bits 17-12
byte3 = (byte3 & 0x3F) << 6; // get lower 6 bits of 3rd byte, shift to bits 11-6
byte4 = byte4 & 0x3F; // get lower 6 bits of 4th byte, shift to bits 5-0
return (byte1 | byte2 | byte3 | byte4); // combine the 4 bytes to get unicode
}
return -1; // return -1 if not a valid unicode character
}
// Return the codepoint at charsRead[index]
int U::codepoint(int index) const {
string charac = get(index);
return convUTF(charac.at(0), charac);
}
// Return true if charsRead is empty
bool U::empty() const {
return charsReadVect.empty();
}
// Clear charsRead
void U::clear() {
charsRead = "";
charsReadVect.clear();
}
// Operator Overloading
// Assuming u is a U, p is a P, and s is a string in code below
// Assignment
U & U::operator=(const string & s) {
this->clear(); // clear the object because we are about to overwrite it
this->append(s); // append the new string
return *this; // return lhs(this)
}
// Append
U & U::operator+=(const string & s) {
this->append(s); // append will add the accumulated string in rhs to our accumulated string and vector
return *this; // return lhs(this)
}
U & U::operator+=(const U & rhs) {
U temp(rhs); // make a copy of rhs to ensure we won't modify it
this->append(temp.charsRead); // append will add the accumulated string in rhs to our accumulated string and vector
return *this; // return the lhs(this)
}
// Concatenation
// u + u
const U U::operator+(const U & rhs) const { //
U result(*this); // make a copy of the lhs(this) to ensure we won't modify it
result += rhs; // let operator+=(U) to do the work
return result; // return the result
}
// u + s
const U U::operator+(const string & s) const {
U result(*this); // make a copy of the lhs(this) to ensure we won't modify it
result += s; // let operator+=(string) do the work
return result; // return the result
}
// s + u
const U operator+(const string & s, const U & rhs) {
U result; // create a U to hold the string
result.append(s); // add string to result
result += rhs; // let operator+=(U) do the work
return result; // return the new string obj
}
// Subscripting
// u[index]
string U::operator[](int index) const {
return get(index); // let get(index) do the work
}
// Send u to output stream
// cout << u
ostream & operator<<(ostream & out, const U & rhs) {
out << rhs.get(); // get the accumulated string and send it to the output stream
return out;
}
// Boolean evaluation
U::operator bool() const {
return !(empty());
}
// Comparison
// u == u
bool U::operator==(const U & rhs) const {
return charsRead == rhs.charsRead; // compare the accumulated strings of the two objs
}
// u == s
bool U::operator==(const string & s) const {
return charsRead == s; // compare the accumulated string and s
}
// s == u
bool operator==(const string & s, const U & rhs) {
return s == rhs.charsRead; // compare the s and the accumulated string
}
// u != u
bool U::operator!=(const U & rhs) const {
return !(*this == rhs); // negate the result of u == u
}
// u != s
bool U::operator!=(const string & s) const {
return !(*this == s); // negate the result of u == s
}
// s != u
bool operator!=(const string & s, const U & rhs) {
return !(s == rhs); // negate the result of s == u
}
// Iterator methods
U::iterator U::begin() const {
return U::iterator(this, 0);
}
U::iterator U::end() const {
return U::iterator(this, size());
}
int U::front() const { // TODO check if should be const
if (size() == 0) // if nothing to
throw string("Attempting to retrieve front() item, but string is empty");
return codepoint(0);
}
int U::back() const { // TODO check if should be const
if (size() == 0)
throw string("Attempting to retrieve back() item, but string is empty");
return codepoint(size() - 1);
}