Skip to content

Commit

Permalink
feature/added-markdown: refactored, moved markdown to separate method
Browse files Browse the repository at this point in the history
  • Loading branch information
megabotan committed Jul 16, 2024
1 parent 566c467 commit 12e25af
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 35 deletions.
13 changes: 8 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ Main class of this library.

* * *

#### ScrapingAntClient.general_request and ScrapingAntClient.general_request_async
#### Common arguments
For:
- ScrapingAntClient.general_request
- ScrapingAntClient.general_request_async
- ScrapingAntClient.markdown_request
- ScrapingAntClient.markdown_request_async

https://docs.scrapingant.com/request-response-format#available-parameters

Expand All @@ -76,7 +81,6 @@ https://docs.scrapingant.com/request-response-format#available-parameters
| return_page_source | <code>boolean</code> | False |
| data | same as [requests param 'data'](https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests) | None |
| json | same as [requests param 'json'](https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests) | None |
| endpoint | None or 'markdown' | None |

**IMPORTANT NOTE:** <code>js_snippet</code> will be encoded to Base64 automatically by the ScrapingAnt client library.

Expand Down Expand Up @@ -275,11 +279,10 @@ from scrapingant_client import ScrapingAntClient
client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

# Sending POST request with json data
result = client.general_request(
result = client.markdown_request(
url="https://example.com",
endpoint='markdown',
)
print(result.text)
print(result.markdown)
```

## Useful links
Expand Down
77 changes: 47 additions & 30 deletions scrapingant_client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
)
from scrapingant_client.headers import convert_headers
from scrapingant_client.proxy_type import ProxyType
from scrapingant_client.response import Response
from scrapingant_client.response import Response, MarkdownResponse
from scrapingant_client.utils import base64_encode_string


Expand All @@ -43,7 +43,8 @@ def _form_payload(
browser: bool = True,
return_page_source: Optional[bool] = None,
) -> Dict:
request_data = {'url': url}
request_data = {
'url': url}
if cookies is not None:
request_data['cookies'] = cookies_list_to_string(cookies)
if js_snippet is not None:
Expand All @@ -60,7 +61,7 @@ def _form_payload(
request_data['return_page_source'] = return_page_source
return request_data

def _parse_response(self, response_status_code: int, response_data: Dict, url: str, endpoint: str) -> Response:
def _check_status_code(self, response_status_code: int, response_data: Dict, url: str) -> None:
if response_status_code == 403:
raise ScrapingantInvalidTokenException()
elif response_status_code == 404:
Expand All @@ -71,25 +72,25 @@ def _parse_response(self, response_status_code: int, response_data: Dict, url: s
raise ScrapingantDetectedException()
elif response_status_code == 500:
raise ScrapingantInternalException()
if endpoint is None or endpoint == 'extended':
content = response_data['html']
cookies_string = response_data['cookies']
text = response_data['text']
status_code = response_data['status_code']
cookies_list = cookies_list_from_string(cookies_string)
return Response(
content=content,
cookies=cookies_list,
text=text,
status_code=status_code
)
elif endpoint == 'markdown':
return Response(
content='',
cookies=[],
text=response_data['markdown'],
status_code=0,
)

def _parse_extended_response(self, response_data: Dict) -> Response:
    """Build a Response from the extended-endpoint JSON payload.

    Expects the keys 'html', 'cookies', 'text' and 'status_code' to be
    present in *response_data*; the cookie string is parsed into a list
    of Cookie objects.
    """
    return Response(
        content=response_data['html'],
        cookies=cookies_list_from_string(response_data['cookies']),
        text=response_data['text'],
        status_code=response_data['status_code'],
    )

def _parse_markdown_response(self, response_data: Dict) -> MarkdownResponse:
    """Wrap the markdown-endpoint JSON payload in a MarkdownResponse.

    Expects the keys 'url' and 'markdown' to be present in *response_data*.
    """
    scraped_url = response_data['url']
    markdown_text = response_data['markdown']
    return MarkdownResponse(url=scraped_url, markdown=markdown_text)

def _get_scrapingant_api_url(self, endpoint: Optional[str] = None) -> str:
if endpoint is None or endpoint == 'extended':
Expand All @@ -99,7 +100,7 @@ def _get_scrapingant_api_url(self, endpoint: Optional[str] = None) -> str:
else:
raise ValueError(f'Invalid endpoint: {endpoint}, must be either None or "markdown"')

def general_request(
def _request(
self,
url: str,
method: str = 'GET',
Expand All @@ -114,7 +115,7 @@ def general_request(
data=None,
json=None,
endpoint: Optional[str] = None,
) -> Response:
) -> Dict:
request_data = self._form_payload(
url=url,
cookies=cookies,
Expand All @@ -138,10 +139,10 @@ def general_request(
raise ScrapingantTimeoutException()
response_status_code = response.status_code
response_data = response.json()
parsed_response: Response = self._parse_response(response_status_code, response_data, url, endpoint)
return parsed_response
self._check_status_code(response_status_code, response_data, url)
return response_data

async def general_request_async(
async def _request_async(
self,
url: str,
method: str = 'GET',
Expand All @@ -156,7 +157,7 @@ async def general_request_async(
data=None,
json=None,
endpoint: Optional[str] = None,
) -> Response:
) -> Dict:
import httpx

request_data = self._form_payload(
Expand Down Expand Up @@ -189,5 +190,21 @@ async def general_request_async(
raise ScrapingantTimeoutException()
response_status_code = response.status_code
response_data = response.json()
parsed_response: Response = self._parse_response(response_status_code, response_data, url, endpoint)
return parsed_response
self._check_status_code(response_status_code, response_data, url)
return response_data

def general_request(self, *args, **kwargs) -> Response:
    """Scrape a page via the extended endpoint (synchronous).

    All arguments are forwarded to the internal request helper; the raw
    JSON payload is then parsed into a Response object.
    """
    raw_data = self._request(*args, endpoint='extended', **kwargs)
    return self._parse_extended_response(raw_data)

async def general_request_async(self, *args, **kwargs) -> Response:
    """Scrape a page via the extended endpoint (asynchronous).

    All arguments are forwarded to the internal async request helper; the
    raw JSON payload is then parsed into a Response object.
    """
    raw_data = await self._request_async(*args, endpoint='extended', **kwargs)
    return self._parse_extended_response(raw_data)

def markdown_request(self, *args, **kwargs) -> MarkdownResponse:
    """Scrape a page as Markdown via the markdown endpoint (synchronous).

    All arguments are forwarded to the internal request helper; the raw
    JSON payload is then parsed into a MarkdownResponse object.
    """
    raw_data = self._request(*args, endpoint='markdown', **kwargs)
    return self._parse_markdown_response(raw_data)

async def markdown_request_async(self, *args, **kwargs) -> MarkdownResponse:
    """Scrape a page as Markdown via the markdown endpoint (asynchronous).

    All arguments are forwarded to the internal async request helper; the
    raw JSON payload is then parsed into a MarkdownResponse object.
    """
    raw_data = await self._request_async(*args, endpoint='markdown', **kwargs)
    return self._parse_markdown_response(raw_data)
6 changes: 6 additions & 0 deletions scrapingant_client/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,9 @@ def __init__(self, content: str, cookies: List[Cookie], text: str, status_code:
self.cookies = cookies
self.text = text
self.status_code = status_code


class MarkdownResponse:
    """Result of a markdown-endpoint scrape.

    Args:
        url: The URL that was scraped.
        markdown: The page content rendered as Markdown text.
    """

    def __init__(self, url: str, markdown: str):
        # Plain attribute assignment, matching the style of Response above.
        self.url = url
        self.markdown = markdown

    def __repr__(self) -> str:
        # Truncate the markdown preview so logs stay readable for large pages.
        preview = self.markdown if len(self.markdown) <= 40 else self.markdown[:40] + '...'
        return f'{type(self).__name__}(url={self.url!r}, markdown={preview!r})'

0 comments on commit 12e25af

Please sign in to comment.