From 6e0e2113c5ff9b8c7a970bdfa11942dc8ad15c32 Mon Sep 17 00:00:00 2001
From: Joe Kearney
Date: Tue, 14 Nov 2023 18:38:38 +0100
Subject: [PATCH] [User reported bug fixes & dependency updates] Min Python version now 3.8

---
 scrapeops_scrapy/__init__.py          |   2 +-
 scrapeops_scrapy/core/error_logger.py | 147 +++++++++---------
 .../validators/response_validator.py  |   4 +-
 setup.py                              |  17 +-
 4 files changed, 85 insertions(+), 85 deletions(-)

diff --git a/scrapeops_scrapy/__init__.py b/scrapeops_scrapy/__init__.py
index de51cef..9dd91d7 100644
--- a/scrapeops_scrapy/__init__.py
+++ b/scrapeops_scrapy/__init__.py
@@ -1 +1 @@
-__version__ = "0.5.3"
\ No newline at end of file
+__version__ = "0.5.4"
\ No newline at end of file
diff --git a/scrapeops_scrapy/core/error_logger.py b/scrapeops_scrapy/core/error_logger.py
index f426761..272033a 100644
--- a/scrapeops_scrapy/core/error_logger.py
+++ b/scrapeops_scrapy/core/error_logger.py
@@ -134,79 +134,63 @@ def emit(self, record):
         try:
             if(record.levelname == "ERROR" or record.levelname == "WARNING" or record.levelname == "CRITICAL"):
-
-                errorMessage = record.message
-                fileAndLine = record.pathname + ', line: ' + str(record.lineno)
-                dateTime = record.asctime
-                type = record.levelname
-                engine = record.name
-
-
-                #covering warnings/probableCause/traceback missing
-                traceback = 'No traceback available'
-                probableCause = ''
-
-                if record.exc_text is not None:
-                    traceback = record.exc_text
-                    splitTraceback = traceback.split('\n')
-                    probableCause = splitTraceback[len(splitTraceback) - 1]
-
-
-                #covering retrys
-                if("Gave up retrying <" in record.message):
-
-                    for retryError in self.retryErrors:
-                        if(retryError in record.message):
-                            method = record.message.split('<')[1].split(' ')[0]
-                            errorMessage = "Error: Gave up retrying " + method + " request - " + retryError
-                            fileAndLine = ''
-                            probableCause = retryError
-                            break
-
-                # Deprecation Warnings
-                if "ScrapyDeprecationWarning:" in record.message and record.message[0] == "/":
-                    splitString = record.message.split("ScrapyDeprecationWarning:")
-                    errorMessage = "ScrapyDeprecationWarning: " + splitString[1]
-                    probableCause = splitString[0]
-
-
-                # "Some Other Error Occurred"
-                if "Some other error occurred: " in record.message:
-                    splitError = record.message.split(' /')
-                    cleanError = splitError[0].split(">: ")[1]
-                    errorMessage = "Some other error occurred: " + cleanError
-                    probableCause = cleanError
-                    traceback = record.message
-
-
-                # Convert Urls To Domains in Error Messages
-                urls = re.findall(r'(https?://[^\s]+)', errorMessage)
-                for url in urls:
-                    domain = DomainNormalizer.get_domain(url)
-                    errorMessage = errorMessage.replace(url, domain)
-
-
-                if errorMessage in self.log_dict:
-                    self.log_dict[errorMessage]['count'] = self.log_dict[errorMessage]['count'] + 1
-                else:
-                    self.log_dict[errorMessage] = {
-                        'type': type,
-                        'engine': engine,
-                        'name': errorMessage,
-                        'count': 1,
-                        'traceback': traceback,
-                        'message' : probableCause,
-                        'filepath': fileAndLine,
-                        'dateTime': dateTime
-                    }
-
-                if(SOPSRequest.HIGH_FREQ_ACC == True):
-
-                    if(errorMessage in self.log_dict_cumulative):
-                        self.log_dict_cumulative[errorMessage]['count'] = self.log_dict_cumulative[errorMessage]['count'] + 1
+
+                if hasattr(record, 'message'):
+                    errorMessage = record.message
+                    fileAndLine = record.pathname + ', line: ' + str(record.lineno)
+                    dateTime = record.asctime
+                    type = record.levelname
+                    engine = record.name
+
+
+                    #covering warnings/probableCause/traceback missing
+                    traceback = 'No traceback available'
+                    probableCause = ''
+
+                    if record.exc_text is not None:
+                        traceback = record.exc_text
+                        splitTraceback = traceback.split('\n')
+                        probableCause = splitTraceback[len(splitTraceback) - 1]
+
+
+                    #covering retrys
+                    if("Gave up retrying <" in record.message):
+
+                        for retryError in self.retryErrors:
+                            if(retryError in record.message):
+                                method = record.message.split('<')[1].split(' ')[0]
+                                errorMessage = "Error: Gave up retrying " + method + " request - " + retryError
+                                fileAndLine = ''
+                                probableCause = retryError
+                                break
+
+                    # Deprecation Warnings
+                    if "ScrapyDeprecationWarning:" in record.message and record.message[0] == "/":
+                        splitString = record.message.split("ScrapyDeprecationWarning:")
+                        errorMessage = "ScrapyDeprecationWarning: " + splitString[1]
+                        probableCause = splitString[0]
+
+
+                    # "Some Other Error Occurred"
+                    if "Some other error occurred: " in record.message:
+                        splitError = record.message.split(' /')
+                        cleanError = splitError[0].split(">: ")[1]
+                        errorMessage = "Some other error occurred: " + cleanError
+                        probableCause = cleanError
+                        traceback = record.message
+
+
+                    # Convert Urls To Domains in Error Messages
+                    urls = re.findall(r'(https?://[^\s]+)', errorMessage)
+                    for url in urls:
+                        domain = DomainNormalizer.get_domain(url)
+                        errorMessage = errorMessage.replace(url, domain)
+
+
+                    if errorMessage in self.log_dict:
+                        self.log_dict[errorMessage]['count'] = self.log_dict[errorMessage]['count'] + 1
                     else:
-
-                    self.log_dict_cumulative[errorMessage] = {
+                        self.log_dict[errorMessage] = {
@@ -215,7 +199,24 @@ def emit(self, record):
                             'type': type,
                             'engine': engine,
                             'name': errorMessage,
                             'count': 1,
                             'traceback': traceback,
                             'message' : probableCause,
                             'filepath': fileAndLine,
                             'dateTime': dateTime
-                    }
+                        }
+
+                    if(SOPSRequest.HIGH_FREQ_ACC == True):
+
+                        if(errorMessage in self.log_dict_cumulative):
+                            self.log_dict_cumulative[errorMessage]['count'] = self.log_dict_cumulative[errorMessage]['count'] + 1
+                        else:
+
+                            self.log_dict_cumulative[errorMessage] = {
+                                'type': type,
+                                'engine': engine,
+                                'name': errorMessage,
+                                'count': 1,
+                                'traceback': traceback,
+                                'message' : probableCause,
+                                'filepath': fileAndLine,
+                                'dateTime': dateTime
+                            }
 
         except Exception as e:
             logging.info('Error: Error in error logger')
diff --git a/scrapeops_scrapy/validators/response_validator.py b/scrapeops_scrapy/validators/response_validator.py
index 75b7529..ed4d8e9 100644
--- a/scrapeops_scrapy/validators/response_validator.py
+++ b/scrapeops_scrapy/validators/response_validator.py
@@ -110,9 +110,9 @@ def string_check(text, text_check, comparison, text_slice=None):
     @staticmethod
     def string_slice(text, text_slice):
         if text_slice.get('active'):
-            if text_slice.get('slice_type') == 'first':
+            if (text_slice.get('slice_type') == 'first') and (len(text) > 0):
                 return text[:text_slice.get('slice_upper_threshold', len(text))]
-            if text_slice.get('slice_type') == 'last':
+            if (text_slice.get('slice_type') == 'last') and (len(text) > 0):
                 return text[-text_slice.get('slice_lower_threshold', 0)]
             if text_slice.get('slice_type') == 'range':
                 return text[text_slice.get('slice_lower_threshold', 0):text_slice.get('slice_upper_threshold', len(text))]
diff --git a/setup.py b/setup.py
index 8bb6385..02c2e28 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
-VERSION = '0.5.3'
+VERSION = '0.5.4'
 DESCRIPTION = 'Scrapeops Scrapy SDK, is a monitoring tool for your Scrapy spiders.' 
 
 setup(name='scrapeops_scrapy',
@@ -14,23 +14,22 @@
       url="https://github.com/ScrapeOps/scrapeops-scrapy-sdk",
       packages=find_packages(),
       install_requires=[
-        "tld>=0.12.4",
-        "requests>=2.24.0",
-        "json5>=0.9.5",
-        "urllib3>=1.25.10",
-        "itemadapter>=0.4.0",
+        "tld>=0.13",
+        "requests>=2.31.0",
+        "json5>=0.9.13",
+        "urllib3>=2.1",
+        "itemadapter>=0.8.0",
       ],
       classifiers=[
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "License :: OSI Approved :: BSD License",
         "Operating System :: OS Independent",
         "Intended Audience :: Developers",
       ],
-      python_requires=">=3.6",
+      python_requires=">=3.8",
 )
\ No newline at end of file
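
Note on the error_logger.py change: the new hasattr(record, 'message') guard matters because logging.LogRecord only gains a .message attribute once a Formatter has called record.getMessage(); a handler that reads record.message (or record.asctime) from a not-yet-formatted record raises AttributeError, which is presumably the user-reported crash this wraps. A minimal standalone sketch of the same guard; GuardedHandler is an illustrative name, not the SDK's handler class:

    import logging

    class GuardedHandler(logging.Handler):
        def emit(self, record):
            # LogRecord.message only exists after Formatter.format() has run
            # record.getMessage(); touching it any earlier raises AttributeError.
            if hasattr(record, 'message'):
                print(record.levelname + ': ' + record.message)

    handler = GuardedHandler()
    raw = logging.LogRecord('demo', logging.ERROR, __file__, 42,
                            'Gave up retrying <GET %s>', ('https://example.com',), None)
    handler.emit(raw)                # skipped: raw has no .message attribute yet
    logging.Formatter().format(raw)  # sets raw.message (and asctime when the format uses it)
    handler.emit(raw)                # prints: ERROR: Gave up retrying <GET https://example.com>

Under this guard, records that reach the handler before any formatter has populated .message are skipped rather than crashing the error logger itself.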
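
Note on the response_validator.py change: both early slice branches now also require len(text) > 0. The 'last' branch shows why the guard is needed: text[-text_slice.get('slice_lower_threshold', 0)] is a single-character index rather than a slice, and since -0 == 0 it reads text[0], which raises IndexError on an empty string. A hedged replica of just that branch; last_slice is a made-up helper, not the SDK API, and what the real method returns for empty text falls through to code outside this hunk:

    # Hypothetical stand-in for the 'last' branch of string_slice; the real
    # method takes a text_slice config dict rather than a plain threshold.
    def last_slice(text, lower_threshold=0):
        if len(text) > 0:                  # the guard added in 0.5.4
            return text[-lower_threshold]  # an index, not a slice: -0 == 0, so this reads text[0]
        return text

    print(last_slice('hello'))  # 'h'  (text[-0] is text[0])
    print(last_slice(''))       # ''   (without the guard, ''[0] raises IndexError)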