-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathfixlengths.rs
225 lines (202 loc) · 8.6 KB
/
fixlengths.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
// Help/usage text for the `fixlengths` subcommand.
// `run` hands this string to `util::get_args` together with the raw argv to
// build `Args`, so the text is runtime data and not just documentation.
// NOTE(review): presumably parsed docopt-style, in which case the option names
// and `[default: ...]` markers must stay in sync with the `Args` fields - confirm.
static USAGE: &str = r#"
Transforms CSV data so that all records have the same length. The length is
the length of the longest record in the data (not counting trailing empty fields,
but at least 1). Records with smaller lengths are padded with empty fields.
This requires two complete scans of the CSV data: one for determining the
record size and one for the actual transform. Because of this, the input
given must be a file and not stdin.
Alternatively, if --length is set, then all records are forced to that length.
This requires a single pass and can be done with stdin.
Usage:
qsv fixlengths [options] [<input>]
qsv fixlengths --help
fixlengths options:
-l, --length <arg> Forcefully set the length of each record. If a
record is not the size given, then it is truncated
or expanded as appropriate.
-r, --remove-empty Remove empty columns.
-i, --insert <pos> If empty fields need to be inserted, insert them
at <pos>. If <pos> is zero, then it is inserted
at the end of each record. If <pos> is negative, it
is inserted from the END of each record going backwards.
If <pos> is positive, it is inserted from the BEGINNING
of each record going forward. [default: 0]
--quote <arg> The quote character to use. [default: "]
--escape <arg> The escape character to use. When not specified,
quotes are escaped by doubling them.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
-Q, --quiet Don't print removed column information.
"#;
use std::cmp;
use serde::Deserialize;
use crate::{
config::{Config, Delimiter},
util, CliResult,
};
/// Parsed command-line arguments for `fixlengths`, produced in `run` by
/// `util::get_args(USAGE, argv)`.
#[derive(Deserialize)]
struct Args {
/// Input path handed to `Config::new` for the reader.
/// NOTE(review): `None` presumably selects stdin (see `config.is_stdin()`) - confirm.
arg_input: Option<String>,
/// Forced record length (`--length`). `run` rejects `Some(0)` with a usage error.
flag_length: Option<usize>,
/// Where padding fields go (`--insert`): 0 appends at the end, a positive value
/// counts from the start of the record, a negative value is added to the target length.
flag_insert: i16,
/// When set (`--remove-empty`), columns that are empty in every record are dropped.
/// `run` refuses to combine this with stdin input.
flag_remove_empty: bool,
/// When set (`--quiet`), suppresses the "Removed N empty column(s)" message on stderr.
flag_quiet: bool,
/// Quote character for the reader; its byte value is passed to `Config::quote`.
flag_quote: Delimiter,
/// Optional escape character. When present, `run` sets it on the reader config and
/// turns `double_quote` off.
flag_escape: Option<Delimiter>,
/// Output path handed to `Config::new` for the writer (`--output`).
flag_output: Option<String>,
/// Field delimiter for the reader (`--delimiter`).
flag_delimiter: Option<Delimiter>,
}
pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut config = Config::new(args.arg_input.as_ref())
.delimiter(args.flag_delimiter)
.quote(args.flag_quote.as_byte())
.no_headers(true)
.flexible(true);
if let Some(escape) = args.flag_escape {
config = config.escape(Some(escape.as_byte())).double_quote(false);
}
if config.is_stdin() && args.flag_remove_empty {
return fail_incorrectusage_clierror!(
"<stdin> cannot be used with --remove-empty. Please specify a file path."
);
}
// First determine if we need to identify existing empty columns
let col_is_empty_vec_opt = if args.flag_remove_empty {
// First pass: identify existing empty columns
let mut rdr = config.reader()?;
let mut record = csv::ByteRecord::new();
let mut col_is_empty_vec: Vec<bool> = Vec::new();
let mut first = true;
while rdr.read_byte_record(&mut record)? {
if first {
col_is_empty_vec = vec![true; record.len()];
first = false;
}
for (i, field) in record.iter().enumerate() {
if !field.is_empty() {
col_is_empty_vec[i] = false;
}
}
}
// Count and report removed columns
let empty_count = col_is_empty_vec.iter().filter(|&&x| x).count();
if !args.flag_quiet && empty_count > 0 {
eprintln!("Removed {empty_count} empty column(s)");
}
Some(col_is_empty_vec)
} else {
None
};
let length = if let Some(length) = args.flag_length {
if length == 0 {
return fail_incorrectusage_clierror!("Length must be greater than 0.");
}
length
} else if args.flag_remove_empty {
// When removing empty columns, we need to determine the maximum length
// after filtering out the empty columns. This requires another pass through the data.
let mut rdr = config.reader()?;
let mut record = csv::ByteRecord::new();
let mut maxlen = 0_usize;
// Get reference to vector indicating which columns are empty
let col_is_empty_vec = col_is_empty_vec_opt.as_ref().unwrap();
while rdr.read_byte_record(&mut record)? {
// For each record, count how many fields would remain after filtering
// out the empty columns. We use get_unchecked since we know i is in bounds.
let filtered_len = record
.iter()
.enumerate()
.filter(|(i, _)| unsafe { !*col_is_empty_vec.get_unchecked(*i) })
.count();
// Keep track of maximum filtered length seen so far
maxlen = cmp::max(maxlen, filtered_len);
}
maxlen
} else {
if config.is_stdin() {
return fail_incorrectusage_clierror!(
"<stdin> cannot be used with if --length is not set. Please specify a file path."
);
}
let mut maxlen = 0_usize;
let mut rdr = config.reader()?;
let mut record = csv::ByteRecord::new();
while rdr.read_byte_record(&mut record)? {
let mut nonempty_count = 0;
for (index, field) in record.iter().enumerate() {
if index == 0 || !field.is_empty() {
nonempty_count = index + 1;
}
}
maxlen = cmp::max(maxlen, nonempty_count);
}
maxlen
};
let mut rdr = config.reader()?;
let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;
let mut record = csv::ByteRecord::new();
let mut record_work = csv::ByteRecord::new();
let mut filtered_record = csv::ByteRecord::new();
let mut field_idx: i16;
let flag_insert = args.flag_insert;
let insert_pos = if flag_insert < 0 {
length as i16 + flag_insert
} else {
flag_insert
};
let mut current_len: usize;
while rdr.read_byte_record(&mut record)? {
// First remove existing empty columns if needed
if let Some(ref is_empty) = col_is_empty_vec_opt {
filtered_record.clear();
for (i, field) in record.iter().enumerate() {
if i < is_empty.len() && !is_empty[i] {
filtered_record.push_field(field);
}
}
record.clone_from(&filtered_record);
}
// Then handle length adjustments by comparing target length with current record length
current_len = record.len();
match length.cmp(¤t_len) {
std::cmp::Ordering::Greater => {
// Record is too short - need to add empty fields
if flag_insert == 0 {
// No insert position specified - append empty fields at end
record.extend((0..length - current_len).map(|_| b""));
} else {
// Insert empty fields at specified position
record_work.clear();
field_idx = 1_i16;
for field in &record {
if field_idx == insert_pos {
// When we reach insert position, add all needed empty fields
record_work.extend((0..length - current_len).map(|_| b""));
}
record_work.push_field(field);
field_idx += 1;
}
if record_work.len() < length {
// If we never hit insert position (it was past end of record),
// append remaining empty fields at the end
record_work.extend((0..length - record_work.len()).map(|_| b""));
}
record.clone_from(&record_work);
}
},
std::cmp::Ordering::Less => {
// Record is too long - truncate to target length
record.truncate(length);
},
std::cmp::Ordering::Equal => {
// Record is already correct length - no changes needed
},
}
wtr.write_byte_record(&record)?;
}
Ok(wtr.flush()?)
}