Skip to content

Commit

Permalink
Merge pull request #1765 from ghutchis/v3000-mol-support
Browse files Browse the repository at this point in the history
Add basic support for v3000 molfiles, including for large molecules
  • Loading branch information
ghutchis authored Nov 9, 2024
2 parents 687e701 + e815396 commit a4831c0
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 7 deletions.
213 changes: 206 additions & 7 deletions avogadro/io/mdlformat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ using Avogadro::Core::Bond;
using Avogadro::Core::Elements;
using Avogadro::Core::lexicalCast;
using Avogadro::Core::Molecule;
using Avogadro::Core::split;
using Avogadro::Core::startsWith;
using Avogadro::Core::trimmed;

Expand Down Expand Up @@ -92,8 +93,10 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
return false;
}
string mdlVersion(trimmed(buffer.substr(33)));
if (mdlVersion != "V2000") {
appendError("Unsupported file format version encountered: " + mdlVersion);
if (mdlVersion == "V3000")
return readV3000(in, mol);
else if (mdlVersion != "V2000") {
appendError("Unsupported MDL version: " + mdlVersion);
return false;
}

Expand Down Expand Up @@ -241,8 +244,8 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
dataValue += buffer;
}
} else if (startsWith(buffer, "> <")) {
// This is a data header, read the name of the entry, and the value on the
// following lines.
// This is a data header, read the name of the entry, and the value on
// the following lines.
dataName = trimmed(buffer).substr(3, buffer.length() - 4);
inValue = true;
}
Expand All @@ -251,11 +254,207 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
return true;
}

bool MdlFormat::readV3000(std::istream& in, Core::Molecule& mol)
{
string buffer;
// we should have M V30 BEGIN CTAB
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN CTAB") {
appendError("Error parsing V3000 file, expected 'M V30 BEGIN CTAB'.");
return false;
}
// now we should get the counts line
// e.g. 'M V30 COUNTS 23694 24297 0 0 1'
getline(in, buffer);
// split by whitespace
std::vector<string> counts = split(trimmed(buffer), ' ');
if (counts.size() < 5) {
appendError("Error parsing V3000 counts line.");
return false;
}
bool ok(false);
int numAtoms(lexicalCast<int>(counts[3], ok));
if (!ok) {
appendError("Error parsing number of atoms.");
return false;
}
int numBonds(lexicalCast<int>(counts[4], ok));
if (!ok) {
appendError("Error parsing number of bonds.");
return false;
}

// Parse the atom block.
// 'M V30 BEGIN ATOM'
// 'M V30 1 N 171.646 251.874 224.877 0'
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN ATOM") {
appendError("Error parsing V3000 atom block.");
return false;
}
for (int i = 0; i < numAtoms; ++i) {
getline(in, buffer);
std::vector<string> atomData = split(trimmed(buffer), ' ');
if (atomData.size() < 7) {
appendError("Error parsing V3000 atom line.");
return false;
}

string element(trimmed(atomData[3]));
unsigned char atomicNum = Elements::atomicNumberFromSymbol(element);
Atom newAtom = mol.addAtom(atomicNum);

Vector3 pos;
pos.x() = lexicalCast<Real>(atomData[4], ok);
if (!ok) {
appendError("Failed to parse x coordinate: " + atomData[3]);
return false;
}
pos.y() = lexicalCast<Real>(atomData[5], ok);
if (!ok) {
appendError("Failed to parse y coordinate: " + atomData[4]);
return false;
}
pos.z() = lexicalCast<Real>(atomData[6], ok);
if (!ok) {
appendError("Failed to parse z coordinate: " + atomData[5]);
return false;
}
newAtom.setPosition3d(pos);
// check for formal charge in the atom block
// CHG=1 for example
if (atomData.size() > 8) {
string chargeData = atomData[8];
if (startsWith(chargeData, "CHG=")) {
int charge = lexicalCast<int>(chargeData.substr(4), ok);
if (!ok) {
appendError("Failed to parse atom charge: " + chargeData);
return false;
}
newAtom.setFormalCharge(charge);
}
}
} // end of atom block
getline(in, buffer);
// check for END ATOM
if (trimmed(buffer) != "M V30 END ATOM") {
appendError("Error parsing V3000 atom block.");
return false;
}

// bond block
// 'M V30 BEGIN BOND'
// 'M V30 1 1 1 2'
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN BOND") {
appendError("Error parsing V3000 bond block.");
return false;
}
for (int i = 0; i < numBonds; ++i) {
getline(in, buffer);
std::vector<string> bondData = split(trimmed(buffer), ' ');
if (bondData.size() < 5) {
appendError("Error parsing V3000 bond line.");
return false;
}
int order = lexicalCast<int>(bondData[3], ok);
if (!ok) {
appendError("Failed to parse bond order: " + bondData[3]);
return false;
}
int atom1 = lexicalCast<int>(bondData[4], ok) - 1;
if (!ok) {
appendError("Failed to parse bond atom1: " + bondData[4]);
return false;
}
int atom2 = lexicalCast<int>(bondData[5], ok) - 1;
if (!ok) {
appendError("Failed to parse bond atom2: " + bondData[5]);
return false;
}
mol.addBond(mol.atom(atom1), mol.atom(atom2),
static_cast<unsigned char>(order));
} // end of bond block

// look for M END
while (getline(in, buffer)) {
if (trimmed(buffer) == "M END")
break;
}
// read in any properties
while (getline(in, buffer)) {
if (startsWith(buffer, "> <")) {
string key = trimmed(buffer.substr(3, buffer.length() - 4));
string value;
while (getline(in, buffer)) {
if (trimmed(buffer) == "")
break;
value += buffer + "\n";
}
mol.setData(key, value);
}
}

return true;
}

bool MdlFormat::writeV3000(std::ostream& out, const Core::Molecule& mol)
{
// write the "fake" counts line
out << " 0 0 0 0 0 999 V3000\n";
out << "M V30 BEGIN CTAB\n";
out << "M V30 COUNTS " << mol.atomCount() << ' ' << mol.bondCount()
<< " 0 0 0\n";
// atom block
out << "M V30 BEGIN ATOM\n";
for (size_t i = 0; i < mol.atomCount(); ++i) {
Atom atom = mol.atom(i);
out << "M V30 " << i + 1 << ' ' << Elements::symbol(atom.atomicNumber())
<< ' ' << atom.position3d().x() << ' ' << atom.position3d().y() << ' '
<< atom.position3d().z() << " 0";
if (atom.formalCharge())
out << " CHG=" << atom.formalCharge();
out << "\n";
}
out << "M V30 END ATOM\n";
// bond block
out << "M V30 BEGIN BOND\n";
for (size_t i = 0; i < mol.bondCount(); ++i) {
Bond bond = mol.bond(i);
out << "M V30 " << i + 1 << ' ' << static_cast<int>(bond.order()) << ' '
<< (bond.atom1().index() + 1) << ' ' << (bond.atom2().index() + 1)
<< " \n";
}
out << "M V30 END BOND\n";
out << "M V30 END CTAB\n";
out << "M END\n";

// TODO: isotopes, radicals, etc.
if (m_writeProperties) {
const auto dataMap = mol.dataMap();
for (const auto& key : dataMap.names()) {
out << "> <" << key << ">\n";
out << dataMap.value(key).toString() << "\n";
out << "\n"; // empty line between data blocks
}
}

if (m_writeProperties || isMode(FileFormat::MultiMolecule))
out << "$$$$\n";

return true;
}

bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
{
// Header lines.
out << mol.data("name").toString() << "\n Avogadro\n\n";
// Counts line.
if (mol.atomCount() > 999 || mol.bondCount() > 999) {
// we need V3000 support for big molecules
return writeV3000(out, mol);
}

out << setw(3) << std::right << mol.atomCount() << setw(3) << mol.bondCount()
<< " 0 0 0 0 0 0 0 0999 V2000\n";
// Atom block.
Expand All @@ -269,7 +468,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
: ((charge <= 3) ? charge : 0);
out << setw(10) << std::right << std::fixed << setprecision(4)
<< atom.position3d().x() << setw(10) << atom.position3d().y()
<< setw(10) << atom.position3d().z() << " " << setw(3) << std::left
<< setw(10) << atom.position3d().z() << ' ' << setw(3) << std::left
<< Elements::symbol(atom.atomicNumber()) << " 0" << setw(3)
<< std::right << chargeField /* for compatibility */
<< " 0 0 0 0 0 0 0 0 0 0\n";
Expand All @@ -286,7 +485,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
for (auto& i : chargeList) {
Index atomIndex = i.first;
signed int atomCharge = i.second;
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << " "
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << ' '
<< setw(3) << atomCharge << "\n";
}
// TODO: isotopes, etc.
Expand All @@ -301,7 +500,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
}
}

if (isMode(FileFormat::MultiMolecule))
if (m_writeProperties || isMode(FileFormat::MultiMolecule))
out << "$$$$\n";

return true;
Expand Down
2 changes: 2 additions & 0 deletions avogadro/io/mdlformat.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ class AVOGADROIO_EXPORT MdlFormat : public FileFormat
std::vector<std::string> mimeTypes() const override;

bool read(std::istream& in, Core::Molecule& molecule) override;
bool readV3000(std::istream& in, Core::Molecule& molecule);
bool write(std::ostream& out, const Core::Molecule& molecule) override;
bool writeV3000(std::ostream& out, const Core::Molecule& molecule);

protected:
bool m_writeProperties = false;
Expand Down
1 change: 1 addition & 0 deletions avogadro/io/sdfformat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ std::vector<std::string> SdfFormat::fileExtensions() const
{
std::vector<std::string> ext;
ext.emplace_back("sdf");
ext.emplace_back("sd3");
return ext;
}

Expand Down

0 comments on commit a4831c0

Please sign in to comment.