Skip to content

Commit

Permalink
Extract validation command and update fpcommand description
Browse files Browse the repository at this point in the history
  • Loading branch information
Dhwaniartefact committed Jan 21, 2025
1 parent 97d7ce2 commit bfd8dfc
Show file tree
Hide file tree
Showing 9 changed files with 268 additions and 444 deletions.
90 changes: 90 additions & 0 deletions src/archivematicaCommon/lib/archivematicaFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,3 +562,93 @@ def get_oidc_secondary_providers(
}

return providers


def jhove_validation_command(module):
    """Return the source of a JHOVE validation script bound to ``module``.

    The returned text is a standalone Python script. When run with a file
    path as its first argument it calls ``jhove -h xml -m <module> <path>``,
    parses the XML report with lxml, and prints a JSON object with the keys
    ``eventOutcomeInformation`` ("pass", "partial pass" or "fail") and
    ``eventOutcomeDetailNote`` (format, optional version, and JHOVE status).

    Args:
        module: Name of the JHOVE module to validate with, e.g. ``"PDF-hul"``
            or ``"JPEG-hul"``. It is converted with ``str()`` before use.

    Returns:
        The script source as a ``str``.
    """
    # Raw string so backslashes in the script survive verbatim. The template
    # holds exactly one printf-style placeholder (``%r``); any literal percent
    # sign added to the script later must be written as ``%%``.
    template = r"""
import json
import subprocess
import sys
from lxml import etree
class JhoveException(Exception):
    pass
def parse_jhove_data(target):
    args = ['jhove', '-h', 'xml', '-m', %r, target]
    try:
        output = subprocess.check_output(args).decode("utf8")
    except subprocess.CalledProcessError:
        raise JhoveException("Jhove failed when running: " + ' '.join(args))
    return etree.fromstring(output.encode("utf8"))
def get_status(doc):
    status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status')
    if status is None:
        raise JhoveException("Unable to find status!")
    return status.text
def get_outcome(status, format=None):
    # JHOVE returns "bytestream" for unrecognized file formats.
    # That can include unrecognized or malformed PDFs, JPEG2000s, etc.
    # Since we're whitelisting the formats we're passing in,
    # "bytestream" indicates that the format is not in fact well-formed
    # regardless of what the status reads.
    if format == "bytestream":
        return "partial pass"
    if status == "Well-Formed and valid":
        return "pass"
    elif status == "Well-Formed, but not valid":
        return "partial pass"
    else:
        return "fail"
def get_format(doc):
    format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format')
    version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version')
    if format is None:
        format = "Not detected"
    else:
        format = format.text
    if version is not None:
        version = version.text
    return (format, version)
def format_event_outcome_detail_note(format, version, result):
    note = 'format="{}";'.format(format)
    if version is not None:
        note = note + ' version="{}";'.format(version)
    note = note + ' result="{}"'.format(result)
    return note
def main(target):
    try:
        doc = parse_jhove_data(target)
        status = get_status(doc)
        format, version = get_format(doc)
        outcome = get_outcome(status, format)
        note = format_event_outcome_detail_note(format, version, status)
        out = {
            "eventOutcomeInformation": outcome,
            "eventOutcomeDetailNote": note
        }
        print(json.dumps(out))
        return 0
    except JhoveException as e:
        return e
if __name__ == '__main__':
    target = sys.argv[1]
    sys.exit(main(target))
"""
    # ``%r`` emits the module name as a correctly escaped Python string
    # literal, so a quote or backslash in the name cannot break the script or
    # add extra jhove arguments. For plain names the result is the same
    # single-quoted text the previous ``'%s'`` placeholder produced. The
    # 1-tuple keeps ``%`` from unpacking a tuple argument.
    return template % (str(module),)
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from archivematicaFunctions import jhove_validation_command
from django.db import migrations

JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735"
Expand Down Expand Up @@ -35,98 +36,17 @@
"a0f916de-ed95-4f2a-9f6d-0cbfd8949cc2",
"f4074907-c111-4e6c-91ae-9c0526475a9a",
)
NEW_JHOVE_VALIDATION_CMD = r"""
import json
import subprocess
import sys

from lxml import etree
class JhoveException(Exception):
pass
def parse_jhove_data(target):
args = ['jhove', '-h', 'xml', '-m', 'PDF-hul', target]
try:
output = subprocess.check_output(args).decode("utf8")
except subprocess.CalledProcessError:
raise JhoveException("Jhove failed when running: " + ' '.join(args))
return etree.fromstring(output.encode("utf8"))
def get_status(doc):
status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status')
if status is None:
raise JhoveException("Unable to find status!")
return status.text
def get_outcome(status, format=None):
# JHOVE returns "bytestream" for unrecognized file formats.
# That can include unrecognized or malformed PDFs, JPEG2000s, etc.
# Since we're whitelisting the formats we're passing in,
# "bytestream" indicates that the format is not in fact well-formed
# regardless of what the status reads.
if format == "bytestream":
return "partial pass"
if status == "Well-Formed and valid":
return "pass"
elif status == "Well-Formed, but not valid":
return "partial pass"
else:
return "fail"
def get_format(doc):
format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format')
version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version')
if format is None:
format = "Not detected"
else:
format = format.text
if version is not None:
version = version.text
return (format, version)
def format_event_outcome_detail_note(format, version, result):
note = 'format="{}";'.format(format)
if version is not None:
note = note + ' version="{}";'.format(version)
note = note + ' result="{}"'.format(result)
return note
def main(target):
try:
doc = parse_jhove_data(target)
status = get_status(doc)
format, version = get_format(doc)
outcome = get_outcome(status, format)
note = format_event_outcome_detail_note(format, version, status)
out = {
"eventOutcomeInformation": outcome,
"eventOutcomeDetailNote": note
}
print(json.dumps(out))
return 0
except JhoveException as e:
return e
if __name__ == '__main__':
target = sys.argv[1]
sys.exit(main(target))
"""
module = "PDF-hul"


def data_migration_up(apps, schema_editor):
"""Update commands and rules."""
_add_fp_command_for_pdf_validation(
apps, NEW_JHOVE_CMD_ID, NEW_JHOVE_VALIDATION_CMD, JHOVE_VALIDATION_RULES_OF_PDF
apps,
NEW_JHOVE_CMD_ID,
jhove_validation_command(module),
JHOVE_VALIDATION_RULES_OF_PDF,
)


Expand All @@ -141,7 +61,7 @@ def _add_fp_command_for_pdf_validation(apps, new_cmd_uuid, new_cmd, rule_uuids):
command=new_cmd,
script_type="pythonScript",
command_usage="validation",
description="Validate PDF using JHOVE",
description="Validate using JHOVE PDF-hul",
)

# Update existing rules
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from archivematicaFunctions import jhove_validation_command
from django.db import migrations

JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735"
Expand All @@ -12,100 +13,22 @@
"913ff712-1856-48d7-85e9-415617fc9fdc",
"cc464095-02b3-471b-8f1d-221aecf37741",
"ab286afc-f429-4e50-8a40-452c6331d630",
"c5a30e3c-2100-4b5b-a9b5-27a236a345dd",
"56c72d8a-139b-4cdf-8dd0-d65a373301d2",
"62f0e3bd-a5bb-4fa0-b78b-dab15253b429",
"5df96ec2-b5a3-48b5-8599-3f292ff525c1",
)

NEW_JHOVE_VALIDATION_CMD = r"""
import json
import subprocess
import sys
from lxml import etree
class JhoveException(Exception):
pass
def parse_jhove_data(target):
args = ['jhove', '-h', 'xml', '-m', 'JPEG-hul', target]
try:
output = subprocess.check_output(args).decode("utf8")
except subprocess.CalledProcessError:
raise JhoveException("Jhove failed when running: " + ' '.join(args))
return etree.fromstring(output.encode("utf8"))
def get_status(doc):
status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status')
if status is None:
raise JhoveException("Unable to find status!")
return status.text
def get_outcome(status, format=None):
# JHOVE returns "bytestream" for unrecognized file formats.
# That can include unrecognized or malformed PDFs, JPEG2000s, etc.
# Since we're whitelisting the formats we're passing in,
# "bytestream" indicates that the format is not in fact well-formed
# regardless of what the status reads.
if format == "bytestream":
return "partial pass"
if status == "Well-Formed and valid":
return "pass"
elif status == "Well-Formed, but not valid":
return "partial pass"
else:
return "fail"
def get_format(doc):
format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format')
version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version')
if format is None:
format = "Not detected"
else:
format = format.text
if version is not None:
version = version.text
return (format, version)
def format_event_outcome_detail_note(format, version, result):
note = 'format="{}";'.format(format)
if version is not None:
note = note + ' version="{}";'.format(version)
note = note + ' result="{}"'.format(result)
return note
def main(target):
try:
doc = parse_jhove_data(target)
status = get_status(doc)
format, version = get_format(doc)
outcome = get_outcome(status, format)
note = format_event_outcome_detail_note(format, version, status)
out = {
"eventOutcomeInformation": outcome,
"eventOutcomeDetailNote": note
}
print(json.dumps(out))
return 0
except JhoveException as e:
return e
if __name__ == '__main__':
target = sys.argv[1]
sys.exit(main(target))
"""
module = "JPEG-hul"


def data_migration_up(apps, schema_editor):
"""Update commands and rules."""
_add_fp_command_for_jpeg_validation(
apps, NEW_JHOVE_CMD_ID, NEW_JHOVE_VALIDATION_CMD, JHOVE_VALIDATION_RULES_OF_JPEG
apps,
NEW_JHOVE_CMD_ID,
jhove_validation_command(module),
JHOVE_VALIDATION_RULES_OF_JPEG,
)


Expand All @@ -120,7 +43,7 @@ def _add_fp_command_for_jpeg_validation(apps, new_cmd_uuid, new_cmd, rule_uuids)
command=new_cmd,
script_type="pythonScript",
command_usage="validation",
description="Validate JPEG using JHOVE",
description="Validate using JHOVE JPEG-hul",
)

# Update existing rules
Expand Down
Loading

0 comments on commit bfd8dfc

Please sign in to comment.