Skip to content

Commit

Permalink
restructure project - files for data processing in cli
Browse files Browse the repository at this point in the history
  • Loading branch information
riasc committed Nov 17, 2024
1 parent e15e9c3 commit 5711c66
Show file tree
Hide file tree
Showing 10 changed files with 236 additions and 13 deletions.
28 changes: 28 additions & 0 deletions cli/include/BEDfileValidator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef GENOGROVE_BEDFILEVALIDATOR_HPP
#define GENOGROVE_BEDFILEVALIDATOR_HPP

// Class
#include "FileFormatValidator.hpp"

// Standard
#include <string>
#include <sstream>
#include <fstream>
#include <algorithm>

namespace genogrove {
// Validator for BED input files; implements the FileFormatValidator interface.
class BEDfileValidator : public FileFormatValidator {
public:
// Creates a validator whose stored error message is empty.
BEDfileValidator();
// Opens the file at filepath and checks it line by line.
// Returns false on the first failure (including a file that cannot be opened).
bool validate(std::string& filepath) override;
// Returns the stored error message (empty until a check has failed).
std::string getErrorMessage() override;

// Checks one line: it needs at least three whitespace-separated columns, and
// the second and third (start, end) must be digit-only and consistently ordered.
// lineNum is only used to build the error message.
bool isValidLine(std::string& line, int lineNum);
// Digit-only check applied to the start and end columns.
bool isValidInteger(std::string& str);

private:
// Description of the most recent validation failure.
std::string errorMessage;
};
}

#endif //GENOGROVE_BEDFILEVALIDATOR_HPP
16 changes: 16 additions & 0 deletions cli/include/FileFormatValidator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef GENOGROVE_FILEFORMATVALIDATOR_HPP
#define GENOGROVE_FILEFORMATVALIDATOR_HPP

#include <string>

namespace genogrove {
// Abstract interface shared by all format-specific file validators.
// Implementations are used through a pointer/reference to this base type,
// hence the virtual destructor.
class FileFormatValidator {
    public:
        virtual ~FileFormatValidator() = default;

        // Checks the file at filepath; returns true when it passes validation.
        virtual bool validate(std::string& filepath) = 0;

        // Returns a description of why validation failed.
        virtual std::string getErrorMessage() = 0;
};
};

#endif //GENOGROVE_FILEFORMATVALIDATOR_HPP
29 changes: 29 additions & 0 deletions cli/include/FileTypeDetector.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef GENOGROVE_FILETYPEDETECTOR_HPP
#define GENOGROVE_FILETYPEDETECTOR_HPP

// Standard
#include <string>
#include <fstream>
#include <iostream>
#include <bitset>
#include <unordered_map>
#include <filesystem>

namespace genogrove {
// File formats the CLI distinguishes between.
// UNKNOWN is the fallback for anything that could not be classified.
enum class FileType {
    BED = 0,
    BEDGRAPH = 1,
    GFF = 2,
    GTF = 3,
    VCF = 4,
    UNKNOWN = 5
};

// Classifies an input file before it is validated and indexed.
class FileTypeDetector {
public:
// Returns the detected FileType together with a one-bit flag that is set
// when the file was recognised as gzip-compressed.
std::tuple<FileType, std::bitset<1>> detectFileType(const std::filesystem::path& filepath);
};
}


#endif //GENOGROVE_FILETYPEDETECTOR_HPP
33 changes: 33 additions & 0 deletions cli/include/FileValidatorFactory.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#ifndef GENOGROVE_FILEVALIDATOR_HPP
#define GENOGROVE_FILEVALIDATOR_HPP

// Standard
#include <filesystem>

// Class
#include "FileFormatValidator.hpp"
#include "FileTypeDetector.hpp"
#include "BEDfileValidator.hpp"

namespace genogrove {
class FileValidatorFactory {
public:
static bool validate(const std::filesystem::path& filepath,
FileType filetype,
std::bitset<1> gzipped,
std::string& errorMessage) {
std::unique_ptr<FileFormatValidator> validator;
switch(filetype) {
case FileType::BED:
validator = std::make_unique<BEDfileValidator>();
break;
default:
errorMessage = "Unsupported file type";
return false;
}
return true;
}
};
}

#endif //GENOGROVE_FILEVALIDATOR_HPP
1 change: 1 addition & 0 deletions cli/include/Index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
// Class
#include "genogrove/all.hpp"
#include "Subcall.hpp"
#include "FileValidatorFactory.hpp"

// CXXopts
#include "cxxopts.hpp"
Expand Down
54 changes: 54 additions & 0 deletions cli/src/BEDfileValidator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include "BEDfileValidator.hpp"

namespace genogrove {
// Starts out with no recorded error.
BEDfileValidator::BEDfileValidator() : errorMessage() {}

// Reads the file at filepath and checks every line with isValidLine().
// Stops at the first offending line; the reason is then available through
// getErrorMessage(). Returns false as well when the file cannot be opened.
bool BEDfileValidator::validate(std::string& filepath) {
    std::ifstream input(filepath);
    if(!input.is_open()) {
        errorMessage = "Failed to open file: " + filepath;
        return false;
    }

    std::string record;
    // line numbers are 1-based so they match what an editor shows
    for(int lineNum = 1; std::getline(input, record); ++lineNum) {
        if(!isValidLine(record, lineNum)) { return false; }
    }
    return true;
}

std::string BEDfileValidator::getErrorMessage() {
return errorMessage;
}

// Checks a single line of a BED file.
//
// A line is accepted when it has at least three whitespace-separated columns
// (chrom, start, end), start and end are digit-only, both fit into 64 bits and
// start does not exceed end.
//
// line    - the raw line as read from the file
// lineNum - 1-based line number, only used in the error message
//
// Returns true when the line is valid; otherwise stores the reason in
// errorMessage and returns false.
bool BEDfileValidator::isValidLine(std::string& line, int lineNum) {
    // Builds the error message lazily so valid lines pay nothing for it.
    const auto fail = [&](const char* reason) {
        errorMessage = std::string(reason) + " in file (line " + std::to_string(lineNum) + "): " + line;
        return false;
    };

    std::stringstream ss(line);
    std::string chrom, start, end;

    if(!(ss >> chrom >> start >> end)) { // the first three columns are mandatory
        return fail("Invalid line");
    }

    if(!isValidInteger(start) || !isValidInteger(end)) { // coordinates must be digit-only
        return fail("Invalid start or end coordinate");
    }

    // Parse into 64 bits via stream extraction: unlike std::stoi this does not
    // throw on overly long digit strings, it just puts the stream into a failed state.
    unsigned long long startPos = 0;
    unsigned long long endPos = 0;
    std::istringstream startStream(start);
    std::istringstream endStream(end);
    if(!(startStream >> startPos) || !(endStream >> endPos)) {
        return fail("Start or end coordinate is out of range");
    }

    if(startPos > endPos) { // start must be less than or equal to end
        return fail("Start coordinate is greater than end coordinate");
    }
    return true;
}

// Reports whether str is a non-empty sequence of decimal digits (no sign,
// no whitespace). An empty string is rejected: it is not a number.
bool BEDfileValidator::isValidInteger(std::string& str) {
    if(str.empty()) { return false; }
    // Plain range comparison instead of ::isdigit, which has undefined
    // behaviour when handed a negative char (e.g. bytes >= 0x80).
    return std::all_of(str.begin(), str.end(),
                       [](char c) { return c >= '0' && c <= '9'; });
}
}


35 changes: 35 additions & 0 deletions cli/src/FileTypeDetector.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "FileTypeDetector.hpp"

namespace genogrove {

// Maps a file extension (including the leading dot) to its FileType.
// Lookups are exact string matches; anything not listed here is treated
// as FileType::UNKNOWN by detectFileType().
static const std::unordered_map<std::string, FileType> extensions = {
{".bed", FileType::BED},
{".bedgraph", FileType::BEDGRAPH},
{".gff", FileType::GFF},
{".gtf", FileType::GTF},
{".vcf", FileType::VCF}
};

std::tuple<FileType, std::bitset<1>> FileTypeDetector::detectFileType(const std::filesystem::path& filepath) {
std::ifstream file(filepath, std::ios::binary);
if(!file) { throw std::runtime_error("Failed to open file: " + filepath.string()); }

// check if the file is gzipped
char buffer[2];
file.read(buffer, 2);
std::bitset<1> gzipped(0);
if(file.gcount() == 2 && buffer[0] == 0x1f && buffer[1] == 0x8b) { gzipped = std::bitset<1>(1); }

// extract the file extension
std::string extension = filepath.extension().string();
if(gzipped == std::bitset<1>(1) && extension == ".gz") {
extension = filepath.stem().extension().string();
}

// look up the file type
auto it = extensions.find(extension);
FileType filetype = (it != extensions.end()) ? it->second : FileType::UNKNOWN;

return std::make_tuple(filetype, gzipped);
}
}
37 changes: 36 additions & 1 deletion cli/src/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ cxxopts::Options Index::parseArgs(int argc, char** argv) {
cxxopts::value<std::string>())
("o, outputfile", "Write the index to the specified file",
cxxopts::value<std::string>())
("k, order", "The order of the tree (default: 3)",
("k, order", "The order of the tree",
cxxopts::value<int>()->default_value("3"))
("h, help", "Print help")

Expand All @@ -18,7 +18,42 @@ cxxopts::Options Index::parseArgs(int argc, char** argv) {
return options;
}


void Index::execute(const cxxopts::ParseResult& args) {
std::cout << "Indexing file: " << args["inputfile"].as<std::string>() << std::endl;
genogrove::IBPTree tree(args["order"].as<int>());
genogrove::BEDfileValidator validator;

std::string inputfile = args["inputfile"].as<std::string>();
// detect the file type
auto [filetype, gzipped] = genogrove::FileTypeDetector().detectFileType(inputfile);
// validate the file
std::string errorMessage;
if(!genogrove::FileValidatorFactory::validate(inputfile, filetype, gzipped, errorMessage)) {
std::cerr << "Error validating file: " << errorMessage << "\n";
return;
}

std::cout << "Successfully validated file\n";






// if(!validator.validate(inputfile)) {
// std::cerr << validator.getErrorMessage() << std::endl;
// return;
// }
//
//







if(args.count("inputfile")) {
std::string inputfile = args["inputfile"].as<std::string>();
std::cout << "Indexing file: " << inputfile << std::endl;
Expand Down
6 changes: 4 additions & 2 deletions cli/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,14 @@ int main(int argc, char** argv) {
}

// parse additional options for the subcommand
cxxopts::Options subcallOptions = command->parseArgs(argc, argv);
cxxopts::ParseResult subcallArgs = subcallOptions.parse(argc, argv);
cxxopts::Options subcallOptions = command->parseArgs(argc -1, argv + 1);
cxxopts::ParseResult subcallArgs = subcallOptions.parse(argc -1, argv + 1);

if(subcallArgs.count("help")) {
std::cout << subcallOptions.help() << "\n";
return 0;
}

command->execute(subcallArgs);
}

10 changes: 0 additions & 10 deletions include/genogrove/DataFileValidator.hpp

This file was deleted.

0 comments on commit 5711c66

Please sign in to comment.