From da1db99fcc52e2f15b1b3dc37a31befe7403b557 Mon Sep 17 00:00:00 2001
From: khyox
Date: Tue, 23 Jan 2024 23:12:00 -0800
Subject: [PATCH] refafilt: add ability to expand multiple headers and show stats

On branch dev
    modified:   refafilt
---
 refafilt | 68 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/refafilt b/refafilt
index 471df40..ad24733 100755
--- a/refafilt
+++ b/refafilt
@@ -227,9 +227,10 @@ def main():
 
     # Main loop: filter FASTA file
     i: int = 0
-    pass_lens: dict[int] = []
-    tiny_lens: dict[int] = []
-    long_lens: dict[int] = []
+    pass_lens: list[int] = []
+    tiny_lens: list[int] = []
+    long_lens: list[int] = []
+    multiplicity: list[int] = []  # Number of headers per expanded sequence
     print(gray('Filtering FASTA file'), f'{in_fasta}',
           gray('...\nMseqs: '), end='')
     sys.stdout.flush()
@@ -244,8 +245,8 @@
           open(output_long, 'w') if max_filt else contextlib.nullcontext(
               None) as long_file
           ):
-        for i, (title, seq) in enumerate(SimpleFastaParser(fa)
-                                         ):
+        for i, (title, seq) in enumerate(SimpleFastaParser(fa)):
+            # Stop at max number of reads or print progress
             if args.maxreads and i >= args.maxreads:
                 print(yellow(' [stopping by maxreads limit]!'), end='')
                 break
@@ -256,26 +257,35 @@
                 print('.', end='')
                 sys.stdout.flush()
             seq_len : int = len(seq)
+            # Expand header
+            maxsplit = (-1 if expand else 0)
+            title_split = title.split('>', maxsplit=maxsplit)
+            if expand and ((num_headers := len(title_split)) > 1):
+                multiplicity.append(num_headers)
+            # Filter by length
             if min_filt and seq_len < min_len:
-                short_file.write(f'>{title}\n')
-                short_file.write(seq + '\n')
-                stats['seq_tiny'] += 1
-                stats['nt_tiny'] += seq_len
-                tiny_lens.append(seq_len)
+                for header in title_split:
+                    short_file.write(f'>{header}\n')
+                    short_file.write(seq + '\n')
+                    stats['seq_tiny'] += 1
+                    stats['nt_tiny'] += seq_len
+                    tiny_lens.append(seq_len)
             elif max_filt and seq_len > max_len:
-                long_file.write(f'>{title}\n')
-                long_file.write(seq + '\n')
-                stats['seq_long'] += 1
-                stats['nt_long'] += seq_len
-                long_lens.append(seq_len)
+                for header in title_split:
+                    long_file.write(f'>{header}\n')
+                    long_file.write(seq + '\n')
+                    stats['seq_long'] += 1
+                    stats['nt_long'] += seq_len
+                    long_lens.append(seq_len)
             else:
-                pass_file.write(f'>{title}\n')
-                pass_file.write(seq + '\n')
-                stats['seq_pass'] += 1
-                stats['nt_pass'] += seq_len
-                pass_lens.append(seq_len)
+                for header in title_split:
+                    pass_file.write(f'>{header}\n')
+                    pass_file.write(seq + '\n')
+                    stats['seq_pass'] += 1
+                    stats['nt_pass'] += seq_len
+                    pass_lens.append(seq_len)
 
-    # Final statistics
+    # Statistics depending on length filters
     print(cyan(f' {i / 1e+6:.3g} Mseqs'), green('OK! '))
     nt_tot: int = stats['nt_pass'] + stats['nt_tiny'] + stats['nt_long']
     print(gray('\nPassed'), magenta(f'{stats["nt_pass"] / 1e+6:.3g}'),
@@ -315,7 +325,7 @@
         print(gray('Too long MAX length: '), f'{np.max(long_lens_np):n}')
     print('')
 
-    # Histogram
+    # Read length log-histogram
    print(gray('Generating log-histogram in file'), f'{output_plot}',
          gray('... '), end='')
    lens_np: np.ndarray = np.concatenate(
@@ -342,6 +352,20 @@
     plt.axvline(x=max_len, color='r')
     plt.savefig(output_plot)
     print(green('OK! '))
+    print('')
+
+    # Statistics for expanded headers
+    multiplicity_np: np.ndarray = np.array(multiplicity)
+    if expand:
+        print(gray('Multiple headers in'), len(multiplicity),
+              gray('sequences'), magenta(f'({len(multiplicity)/i:.3%})'))
+        if multiplicity:
+            print(gray('Header MIN multiplicity: '),
+                  f'{np.min(multiplicity_np):n}')
+            print(gray('Header AVG multiplicity: '),
+                  f'{np.average(multiplicity_np):.2g}')
+            print(gray('Header MAX multiplicity: '),
+                  f'{np.max(multiplicity_np):n}')
 
     # Timing results
     print(gray('\nTotal elapsed time:'), time.strftime(