Skip to content

Commit

Permalink
Merge branch 'main' into 1322-fix-cafc-oral-arg
Browse files Browse the repository at this point in the history
  • Loading branch information
grossir authored Feb 25, 2025
2 parents 987f08f + 2fa847a commit c654714
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 25 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ Releases are also tagged in git, if that's helpful.

- Fixes:
- fix cafc oral argument scraper PR [#1325](https://github.com/freelawproject/juriscraper/pull/1325)
- ignore future date sanity check when date filed is approximate #1321
- new exception InvalidDocumentError to be raised when an error page is detected
- update mont parsing; and raise InvalidDocumentError



## Current

Expand Down
11 changes: 10 additions & 1 deletion juriscraper/AbstractSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,17 @@ def _check_sanity(self):
case_date = fixed_date
self.case_dates[index] = fixed_date

# If a date is approximate, then it may be set in the future until
# half of the year has passed. Ignore this case
if hasattr(self, "date_filed_is_approximate"):
date_is_approximate = self.date_filed_is_approximate[index]
else:
date_is_approximate = False

# dates should not be in the future. Tolerate a week
if case_date > (date.today() + timedelta(days=7)):
if not date_is_approximate and case_date > (
date.today() + timedelta(days=7)
):
future_date_count += 1
error = f"{self.court_id}: {case_date} date is in the future. Case '{case_name}'"
logger.error(error)
Expand Down
9 changes: 9 additions & 0 deletions juriscraper/lib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,12 @@ class PacerLoginException(Exception):

def __init__(self, message):
Exception.__init__(self, message)


class InvalidDocumentError(Exception):
    """Raised when the document fetched from `download_url` is invalid.

    The document may be an error page that goes undetected by
    `response.raise_for_status` or by our `expected_content_type` controls.
    The proper place to raise this is `Site.cleanup_content`.
    """
57 changes: 45 additions & 12 deletions juriscraper/opinions/united_states/state/mont.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,32 @@

import re

from juriscraper.lib.exceptions import InvalidDocumentError
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
base_url = "https://juddocumentservice.mt.gov"
download_base = f"{base_url}/getDocByCTrackId?DocId="
cite_regex = r"((19|20)\d{2}\sMT\s\d{1,3}[A-Z]?)"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.base = "https://juddocumentservice.mt.gov"
self.url = f"{self.base}/getDailyOrders"
self.download_base = f"{self.base}/getDocByCTrackId?DocId="
self.url = f"{self.base_url}/getDailyOrders"
self.expected_content_types = None
self.cite_regex = r"((19|20)\d{2}\sMT\s\d{1,3}[A-Z]?)"

def _process_html(self):
for row in self.html:
summary = row["documentDescription"]
if not summary.startswith("Opinion"):
description = row["documentDescription"]
if not description.startswith("Opinion"):
# skip orders and just do opinions
continue
status = "Published" if "Published" in summary else "Unpublished"

status = (
"Published" if "Published" in description else "Unpublished"
)

docket = row["caseNumber"]
if docket.startswith("DA"):
nature = "Direct Appeal"
Expand All @@ -36,20 +42,31 @@ def _process_html(self):
else:
nature = "Unknown"

m = re.search(
r"Justice (?P<author>.*?)\s*(?:author|,|-)", summary, re.I
)
author = m.group("author") if m else ""
author = ""
disposition = ""
summary = ""
if author_match := re.search(
r"Justice (?P<author>.*?)\s*(?:author(ed)?|,|-|\.)",
description,
re.I,
):
author = author_match.group("author")
disposition = description[author_match.end() :].strip(" .,-")
disposition = disposition[:1].upper() + disposition[1:]
else:
summary = description

self.cases.append(
{
"url": self.download_base + row["cTrackId"],
"status": status,
"date": row["fileDate"],
"name": row["title"],
"docket": docket,
"summary": summary,
"nature_of_suit": nature,
"author": author,
"disposition": disposition,
"summary": summary,
}
)

Expand All @@ -63,3 +80,19 @@ def extract_from_text(self, scraped_text: str) -> dict:
if match := re.search(self.cite_regex, first_text):
return {"Citation": match.group(0)}
return {}

@staticmethod
def cleanup_content(content: str) -> str:
"""Raise an error if the content is invalid; otherwise just return it
Not cleaning up in the common sense; but avoids ingesting error
pages. This source does not mark the error page with an error status
and does not have content type headers; so we can't detect the error
through standard controls
:param content: the downloaded content
:return: the downloaded content, unchanged
"""
if "No document found with CTrack ID" in content[:1000]:
raise InvalidDocumentError(content)
return content
36 changes: 24 additions & 12 deletions tests/examples/opinions/united_states/mont_example.compare.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 17-0645",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum - Chief Justice McGrath, affirmed.",
"summaries": "",
"case_name_shorts": "",
"authors": "McGrath"
},
Expand All @@ -19,9 +20,10 @@
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 18-0603",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Published - Justice Sandefur - Affirmed",
"summaries": "",
"case_name_shorts": "Speer",
"authors": "Sandefur"
},
Expand All @@ -32,9 +34,10 @@
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Reversed and Remanded",
"docket_numbers": "DA 19-0385",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Published - Justice Baker - Reversed and Remanded",
"summaries": "",
"case_name_shorts": "",
"authors": "Baker"
},
Expand All @@ -45,9 +48,10 @@
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 19-0159",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Published - Justice Rice, affirmed.",
"summaries": "",
"case_name_shorts": "Payne",
"authors": "Rice"
},
Expand All @@ -58,9 +62,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 19-0465",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum Justice Ingrid Gustafson author -Affirmed",
"summaries": "",
"case_name_shorts": "Marriage of Paschen",
"authors": "Ingrid Gustafson"
},
Expand All @@ -71,9 +76,10 @@
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affrimed",
"docket_numbers": "DA 19-0437",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Published - Justice Gustafson - Affrimed",
"summaries": "",
"case_name_shorts": "Marriage of Lewis",
"authors": "Gustafson"
},
Expand All @@ -84,9 +90,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed and Remanded",
"docket_numbers": "DA 18-0387",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum - Justice Baker - Affirmed and Remanded",
"summaries": "",
"case_name_shorts": "",
"authors": "Baker"
},
Expand All @@ -97,9 +104,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 19-0291",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum - Justice Gustafson - Affirmed",
"summaries": "",
"case_name_shorts": "",
"authors": "Gustafson"
},
Expand All @@ -110,9 +118,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed in part, reversed in part and remanded",
"docket_numbers": "DA 19-0282",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum; Justice Rice Author - Affirmed in part, reversed in part and remanded.",
"summaries": "",
"case_name_shorts": "Marriage of Hamling",
"authors": "Rice"
},
Expand All @@ -123,9 +132,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 18-0616",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum - Justice McKinnon, affirmed.",
"summaries": "",
"case_name_shorts": "Given",
"authors": "McKinnon"
},
Expand All @@ -136,9 +146,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 18-0452",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum Chief Justice McGrath authored, Affirmed.",
"summaries": "",
"case_name_shorts": "",
"authors": "McGrath"
},
Expand All @@ -149,9 +160,10 @@
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"dispositions": "Affirmed",
"docket_numbers": "DA 18-0586",
"nature_of_suit": "Direct Appeal",
"summaries": "Opinion - Noncite/Memorandum - Chief Justice Mike McGrath - Affirmed",
"summaries": "",
"case_name_shorts": "Leuchtman",
"authors": "Mike McGrath"
}
Expand Down

0 comments on commit c654714

Please sign in to comment.