diff --git a/CHANGES.md b/CHANGES.md index 6293d7489..bcfd674be 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,6 +13,8 @@ Releases are also tagged in git, if that's helpful. ## Coming up - Add workflow to check for new entries in CHANGES.md file +- new exception InvalidDocumentError to be raised when an error page is detected +- update mont parsing; and raise InvalidDocumentError ## Current diff --git a/juriscraper/lib/exceptions.py b/juriscraper/lib/exceptions.py index 488d0c994..fa08fc450 100644 --- a/juriscraper/lib/exceptions.py +++ b/juriscraper/lib/exceptions.py @@ -45,3 +45,12 @@ class PacerLoginException(Exception): def __init__(self, message): Exception.__init__(self, message) + + +class InvalidDocumentError(Exception): + """Raised when the document got from `download_url` is invalid + + May be an error page that is undetected by `response.raise_for_status` + or our `expected_content_type` controls. Proper place to raise this + would be on `Site.cleanup_content` + """ diff --git a/juriscraper/opinions/united_states/state/mont.py b/juriscraper/opinions/united_states/state/mont.py index 084a699c4..d91770054 100644 --- a/juriscraper/opinions/united_states/state/mont.py +++ b/juriscraper/opinions/united_states/state/mont.py @@ -4,14 +4,15 @@ import re +from juriscraper.lib.exceptions import InvalidDocumentError from juriscraper.OpinionSiteLinear import OpinionSiteLinear class Site(OpinionSiteLinear): base_url = "https://juddocumentservice.mt.gov" download_base = f"{base_url}/getDocByCTrackId?DocId=" - cite_regex = r"((19|20)\d{2}\sMT\s\d{1,3}[A-Z]?)" - + cite_regex = r"((19|20)\d{2}\sMT\s\d{1,3}[A-Z]?)" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.court_id = self.__module__ @@ -20,13 +21,15 @@ def __init__(self, *args, **kwargs): def _process_html(self): for row in self.html: - summary = row["documentDescription"] - if not summary.startswith("Opinion"): + description = row["documentDescription"] + if not 
description.startswith("Opinion"): # skip orders and just do opinions continue - - status = "Published" if "Published" in summary else "Unpublished" - + + status = ( + "Published" if "Published" in description else "Unpublished" + ) + docket = row["caseNumber"] if docket.startswith("DA"): nature = "Direct Appeal" @@ -39,14 +42,20 @@ def _process_html(self): else: nature = "Unknown" - # TODO: parse disp? - # Opinion - Noncite/Memorandum Chief Justice McGrath authored, Affirmed. - # Opinion - Noncite/Memorandum - Justice Baker - Affirmed and Remanded - # Opinion - Noncite/Memorandum; Justice Rice Author - Affirmed in part, reversed in part and remanded. - m = re.search( - r"Justice (?P<author>.*?)\s*(?:author|,|-)", summary, re.I - ) - author = m.group("author") if m else "" + author = "" + disposition = "" + summary = "" + if author_match := re.search( + r"Justice (?P<author>.*?)\s*(?:author(ed)?|,|-|\.)", + description, + re.I, + ): + author = author_match.group("author") + disposition = description[author_match.end() :].strip(" .,-") + disposition = disposition[:1].upper() + disposition[1:] + else: + summary = description + self.cases.append( { "url": self.download_base + row["cTrackId"], @@ -56,6 +65,8 @@ def _process_html(self): "docket": docket, "nature_of_suit": nature, "author": author, + "disposition": disposition, + "summary": summary, } ) @@ -70,6 +81,18 @@ def extract_from_text(self, scraped_text: str) -> dict: return {"Citation": match.group(0)} return {} - def cleanup_content(content: str): - """""" - pass \ No newline at end of file + @staticmethod + def cleanup_content(content: str) -> str: + """Raise an error if the content is invalid; otherwise just return it + + Not cleaning up in the common sense; but avoids ingesting error + pages.
This source does not mark the error page with an error status + and does not have content type headers; so we can't detect the error + through standard controls + + :param content: the downloaded content + :return: the downloaded content, unchanged + """ + if "No document found with CTrack ID" in content[:1000]: + raise InvalidDocumentError(content) + return content diff --git a/tests/examples/opinions/united_states/mont_example.compare.json b/tests/examples/opinions/united_states/mont_example.compare.json index 13d9636a2..192722775 100644 --- a/tests/examples/opinions/united_states/mont_example.compare.json +++ b/tests/examples/opinions/united_states/mont_example.compare.json @@ -6,9 +6,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 17-0645", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum - Chief Justice McGrath, affirmed.", + "summaries": "", "case_name_shorts": "", "authors": "McGrath" }, @@ -19,9 +20,10 @@ "precedential_statuses": "Published", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 18-0603", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Published - Justice Sandefur - Affirmed", + "summaries": "", "case_name_shorts": "Speer", "authors": "Sandefur" }, @@ -32,9 +34,10 @@ "precedential_statuses": "Published", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Reversed and Remanded", "docket_numbers": "DA 19-0385", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Published - Justice Baker - Reversed and Remanded", + "summaries": "", "case_name_shorts": "", "authors": "Baker" }, @@ -45,9 +48,10 @@ "precedential_statuses": "Published", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 19-0159", "nature_of_suit": "Direct Appeal", 
- "summaries": "Opinion - Published - Justice Rice, affirmed.", + "summaries": "", "case_name_shorts": "Payne", "authors": "Rice" }, @@ -58,9 +62,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 19-0465", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum Justice Ingrid Gustafson author -Affirmed", + "summaries": "", "case_name_shorts": "Marriage of Paschen", "authors": "Ingrid Gustafson" }, @@ -71,9 +76,10 @@ "precedential_statuses": "Published", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affrimed", "docket_numbers": "DA 19-0437", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Published - Justice Gustafson - Affrimed", + "summaries": "", "case_name_shorts": "Marriage of Lewis", "authors": "Gustafson" }, @@ -84,9 +90,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed and Remanded", "docket_numbers": "DA 18-0387", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum - Justice Baker - Affirmed and Remanded", + "summaries": "", "case_name_shorts": "", "authors": "Baker" }, @@ -97,9 +104,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 19-0291", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum - Justice Gustafson - Affirmed", + "summaries": "", "case_name_shorts": "", "authors": "Gustafson" }, @@ -110,9 +118,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed in part, reversed in part and remanded", "docket_numbers": "DA 19-0282", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum; Justice Rice Author - Affirmed in 
part, reversed in part and remanded.", + "summaries": "", "case_name_shorts": "Marriage of Hamling", "authors": "Rice" }, @@ -123,9 +132,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 18-0616", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum - Justice McKinnon, affirmed.", + "summaries": "", "case_name_shorts": "Given", "authors": "McKinnon" }, @@ -136,9 +146,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 18-0452", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum Chief Justice McGrath authored, Affirmed.", + "summaries": "", "case_name_shorts": "", "authors": "McGrath" }, @@ -149,9 +160,10 @@ "precedential_statuses": "Unpublished", "blocked_statuses": false, "date_filed_is_approximate": false, + "dispositions": "Affirmed", "docket_numbers": "DA 18-0586", "nature_of_suit": "Direct Appeal", - "summaries": "Opinion - Noncite/Memorandum - Chief Justice Mike McGrath - Affirmed", + "summaries": "", "case_name_shorts": "Leuchtman", "authors": "Mike McGrath" }