From 9b452a1ba6fb07bb1cbf251bdafb03ae5f656dbb Mon Sep 17 00:00:00 2001 From: shirazos7 Date: Thu, 12 Dec 2024 15:53:12 +0100 Subject: [PATCH 1/3] adjusting the parser to be defusedxml and lxml for xslt transformation --- src/swmath2swh/staging_deposit_v2.py | 29 ++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/swmath2swh/staging_deposit_v2.py b/src/swmath2swh/staging_deposit_v2.py index b568b10..c8565cd 100644 --- a/src/swmath2swh/staging_deposit_v2.py +++ b/src/swmath2swh/staging_deposit_v2.py @@ -1,33 +1,47 @@ from swmath2swh.restApi_software_Json import process_metadata from swmath2swh.convertSoftware_from_json_toXml import convert_json_to_xml -import lxml.etree as ET +import defusedxml.ElementTree as DET # Using defusedxml for safe XML parsing +import lxml.etree as ET # Using lxml for XSLT transformations import pandas as pd import subprocess import time import tempfile import os import requests - +import re +# Load environment variables env = os.environ.copy() env['SWMATH_USER_DEPOSIT'] = os.getenv('SWMATH_USER_DEPOSIT') env['SWMATH_PWD_DEPOSIT'] = os.getenv('SWMATH_PWD_DEPOSIT') -env['SWMATH_PWD_DEPOSIT'] = os.getenv('SWMATH_PWD_DEPOSIT') -xsl_filename = '../xslt/software/xslt_SWH_deposit.xslt' +xsl_filename = '../../xslt/software/xslt_SWH_deposit.xslt' + +# Fetch XML data r = requests.get("https://oai.staging.mardi4nfdi.org/oai/OAIHandler?verb=GetRecord&metadataPrefix=codemeta&identifier=oai:swmath.org:4532") xml_str = r.content -dom = ET.fromstring(xml_str) + +# Parse the XML safely using defusedxml +dom_safe = DET.fromstring(xml_str) + +# Convert the defusedxml tree to a string so lxml can parse it +dom_str = DET.tostring(dom_safe) +dom = ET.fromstring(dom_str) # Convert to lxml's Element for XSLT processing + +# Load and apply the XSLT transformation xslt = ET.parse(xsl_filename) transform = ET.XSLT(xslt) newdom = transform(dom) formatted_newdom = ET.tostring(newdom, pretty_print=True, encoding='unicode') -print(xml_str) - # Add this before the write statement +formatted_newdom = re.sub(r'xmlns:ns\d+="[^"]+"', '', formatted_newdom) +formatted_newdom = re.sub(r'ns\d+:', 'codemeta:', formatted_newdom) +print(formatted_newdom) + # Write transformed XML to a temporary file with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.xml') as temp_file: temp_file.write(formatted_newdom) temp_filename = temp_file.name print(f"Temporary file created: {temp_filename}") + # Format current time for deposit status current_time = time.localtime() formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", current_time) @@ -46,4 +60,3 @@ "--metadata", temp_filename, "--format", "json" ]) - From 5f487dd70a585e511a8ffa8c8021815b8a146fea Mon Sep 17 00:00:00 2001 From: shirazos7 Date: Thu, 12 Dec 2024 16:01:08 +0100 Subject: [PATCH 2/3] adjusting the parser to be defusedxml and lxml for xslt transformation --- src/swmath2swh/staging_deposit_v2.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/swmath2swh/staging_deposit_v2.py b/src/swmath2swh/staging_deposit_v2.py index c8565cd..ec93ba4 100644 --- a/src/swmath2swh/staging_deposit_v2.py +++ b/src/swmath2swh/staging_deposit_v2.py @@ -20,17 +20,18 @@ r = requests.get("https://oai.staging.mardi4nfdi.org/oai/OAIHandler?verb=GetRecord&metadataPrefix=codemeta&identifier=oai:swmath.org:4532") xml_str = r.content -# Parse the XML safely using defusedxml -dom_safe = DET.fromstring(xml_str) +dom = DET.fromstring(xml_str) -# Convert the defusedxml tree to a string so lxml can parse it -dom_str = DET.tostring(dom_safe) -dom = ET.fromstring(dom_str) # Convert to lxml's Element for XSLT processing +# Convert the defusedxml-parsed XML to a string for lxml processing +dom_str = DET.tostring(dom, encoding='unicode') -# Load and apply the XSLT transformation +# Use lxml to parse the XML string for XSLT transformation +lxml_dom = ET.fromstring(dom_str) + +# Perform XSLT transformation using lxml xslt = ET.parse(xsl_filename) transform = ET.XSLT(xslt) -newdom = transform(dom) +newdom = transform(lxml_dom) formatted_newdom = ET.tostring(newdom, pretty_print=True, encoding='unicode') formatted_newdom = re.sub(r'xmlns:ns\d+="[^"]+"', '', formatted_newdom) formatted_newdom = re.sub(r'ns\d+:', 'codemeta:', formatted_newdom) From f3bb8693a2f21aaaea1fb8b849d705e73b852462 Mon Sep 17 00:00:00 2001 From: shirazos7 Date: Thu, 12 Dec 2024 16:07:45 +0100 Subject: [PATCH 3/3] fixing some bugs --- src/swmath2swh/staging_deposit_v2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/swmath2swh/staging_deposit_v2.py b/src/swmath2swh/staging_deposit_v2.py index ec93ba4..46e4b93 100644 --- a/src/swmath2swh/staging_deposit_v2.py +++ b/src/swmath2swh/staging_deposit_v2.py @@ -1,7 +1,8 @@ from swmath2swh.restApi_software_Json import process_metadata from swmath2swh.convertSoftware_from_json_toXml import convert_json_to_xml import defusedxml.ElementTree as DET # Using defusedxml for safe XML parsing -import lxml.etree as ET # Using lxml for XSLT transformations +from defusedxml.lxml import fromstring +import lxml.etree as ET import pandas as pd import subprocess import time @@ -26,7 +27,7 @@ dom_str = DET.tostring(dom, encoding='unicode') # Use lxml to parse the XML string for XSLT transformation -lxml_dom = ET.fromstring(dom_str) +lxml_dom = fromstring(dom_str) # Perform XSLT transformation using lxml xslt = ET.parse(xsl_filename)