-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SWERIKS check
- Loading branch information
Showing
2 changed files
with
395 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"cells": [], | ||
"metadata": {}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,389 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "6bf94f08-39fc-47f8-b310-b7ff0a1c77fd", | ||
"metadata": {}, | ||
"source": [ | ||
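"### Link rot check: Wikidata → SWERIK\n", | ||
"See https://github.com/swerik-project/riksdagen-persons/issues/27#issuecomment-2456456383\n", | ||
"\n", | ||
"Checks the SWERIK links stored in Wikidata for link rot: every item with a [P12192](https://www.wikidata.org/wiki/Property:P12192) value is fetched from the SWERIK person catalog, and the URLs that do not resolve are listed.\n", | ||
"\n", | ||
"Version 0.1 has a progress bar." | ||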
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "da850da8-c182-4cf6-ae82-5734b2c25c95", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Last run: 2024-11-12 10:09:02.380817\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from datetime import datetime\n", | ||
"\n", | ||
"# Take one timestamp so the printed start time is exactly the value that\n", | ||
"# the elapsed-time report at the end of the notebook is measured from.\n", | ||
"start_time = datetime.now()\n", | ||
"print(\"Last run: \", start_time)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "73aca08c-58ca-46ad-b922-c3fb10169645", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: wikibaseintegrator in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.12.8)\n", | ||
"Requirement already satisfied: backoff<3.0.0,>=2.2.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (2.2.1)\n", | ||
"Requirement already satisfied: mwoauth<0.5.0,>=0.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (0.4.0)\n", | ||
"Requirement already satisfied: oauthlib<4.0.0,>=3.2.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (3.2.2)\n", | ||
"Requirement already satisfied: requests<3.0.0,>=2.31.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (2.31.0)\n", | ||
"Requirement already satisfied: requests-oauthlib<3.0.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (2.0.0)\n", | ||
"Requirement already satisfied: ujson<6.0.0,>=5.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (5.10.0)\n", | ||
"Requirement already satisfied: PyJWT>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from mwoauth<0.5.0,>=0.4.0->wikibaseintegrator) (2.9.0)\n", | ||
"Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (3.3.2)\n", | ||
"Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (3.6)\n", | ||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (1.26.18)\n", | ||
"Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (2024.2.2)\n", | ||
"\n", | ||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", | ||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Library/Frameworks/Python.framework/Versions/3.12/bin/python3.12 -m pip install --upgrade pip\u001b[0m\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import sys\n", | ||
"# Invoke pip through the interpreter that runs this kernel (sys.executable)\n", | ||
"# so the package is installed for the Python this notebook is executing in.\n", | ||
"!{sys.executable} -m pip install wikibaseintegrator" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "973666b1-3552-482e-b4ac-66cfd141ab6d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from wikibaseintegrator.wbi_helpers import execute_sparql_query\n", | ||
"from wikibaseintegrator import WikibaseIntegrator \n", | ||
"from wikibaseintegrator.wbi_config import config as wbi_config" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "5a198701-5776-4cef-96b3-e8b803446d11", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Identify this client in the requests made through wikibaseintegrator.\n", | ||
"wbi_config['USER_AGENT'] = 'WikibaseIntegrator in PAWS by salgo60'\n", | ||
"wbi = WikibaseIntegrator()\n", | ||
"\n", | ||
"# Select every item (?wd) that has a P12192 statement, with its value (?swerik).\n", | ||
"SWERIK_QUERY = \"\"\"\n", | ||
"SELECT ?wd ?swerik WHERE {\n", | ||
" ?wd wdt:P12192 ?swerik.\n", | ||
"} \n", | ||
"\"\"\"\n", | ||
"results = execute_sparql_query(SWERIK_QUERY)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "ccdece9f-5f9a-451e-b1d1-8767c4b23160", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Found 6177 results\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Pull the list of result rows out of the SPARQL JSON response.\n", | ||
"bindings = results[\"results\"][\"bindings\"]\n", | ||
"print(\"Found {} results\".format(len(bindings)))\n", | ||
"\n", | ||
"count = 1\n", | ||
"\n", | ||
"# Module-level tallies; checkurl() increments them via `global`.\n", | ||
"NrValid = 0\n", | ||
"NrnotValid = 0" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "a5832d52-5f28-4dab-b2aa-487c421c2a3a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import requests\n", | ||
"\n", | ||
"def checkurl(wd, swerik, timeout=10):\n", | ||
"    \"\"\"Check that the person-catalog page for one SWERIK value resolves.\n", | ||
"\n", | ||
"    Builds the person-catalog URL from ``swerik``, fetches it and updates the\n", | ||
"    module-level counters ``NrValid`` / ``NrnotValid``. Every URL that does\n", | ||
"    not answer with HTTP 200 is printed together with ``wd``.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        wd: Wikidata id of the item; used only in the printed report.\n", | ||
"        swerik: value appended to the person-catalog base URL.\n", | ||
"        timeout: seconds to wait for the server, so that a single stalled\n", | ||
"            connection cannot hang the whole run.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A ``(success, error_message)`` pair on every path:\n", | ||
"        ``(True, None)`` for HTTP 200, ``(False, None)`` for any other\n", | ||
"        status code, and ``(False, message)`` when the request itself failed.\n", | ||
"    \"\"\"\n", | ||
"    global NrValid\n", | ||
"    global NrnotValid\n", | ||
"    base_url = f\"https://swerik-project.github.io/person-catalog/{swerik}\"\n", | ||
"    try:\n", | ||
"        response = requests.get(base_url, timeout=timeout)\n", | ||
"    except requests.exceptions.RequestException as e:\n", | ||
"        NrnotValid += 1\n", | ||
"        print(f\"WD {wd} - {base_url}\")\n", | ||
"        print(f\"Error: {e}\")\n", | ||
"        return False, f\"WD {wd} - {base_url} - Error: {e}\"\n", | ||
"    if response.status_code == 200:\n", | ||
"        NrValid += 1\n", | ||
"        return True, None\n", | ||
"    # The server answered but the page is not there: count and report it.\n", | ||
"    # No message is returned because the request itself did not fail.\n", | ||
"    NrnotValid += 1\n", | ||
"    print(f\"WD {wd} - {base_url}\")\n", | ||
"    return False, None" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "c8917366-acbc-4c7e-a159-e4d858b27b91", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# pip install tqdm\n", | ||
"from tqdm.notebook import tqdm \n", | ||
"from time import sleep\n", | ||
"from tqdm import tqdm" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1b97e742-71be-4274-8bfc-a2d6b6112f95", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 12%|██▎ | 713/6177 [03:05<28:31, 3.19it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q4934552 - https://swerik-project.github.io/person-catalog/i-PCZrYEHwPaEeNTZphEsWTv\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 15%|██▉ | 905/6177 [04:05<28:28, 3.09it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q4957371 - https://swerik-project.github.io/person-catalog/i-31gPpUoSm7zqzQckVmfPGy\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 16%|███▏ | 995/6177 [04:33<27:36, 3.13it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q4970175 - https://swerik-project.github.io/person-catalog/i-UX4D3JJdrTjFBf2zyfHx5t\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 19%|███▌ | 1159/6177 [05:25<25:59, 3.22it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q4976825 - https://swerik-project.github.io/person-catalog/i-NvxzaU2RSok83zCskNAuhg\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 38%|███████▏ | 2352/6177 [11:45<19:55, 3.20it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q97971262 - https://swerik-project.github.io/person-catalog/i-RH6VCPhyxs9yYcfXJzPxYT\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 38%|███████▎ | 2359/6177 [11:46<18:29, 3.44it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q97971276 - https://swerik-project.github.io/person-catalog/i-Cdgsqn4Ts9WMwbjXcE4537\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 38%|███████▎ | 2377/6177 [11:52<19:45, 3.21it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q98271639 - https://swerik-project.github.io/person-catalog/i-x1CuoKmRHYgQr9i2kh3B5\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 39%|███████▎ | 2388/6177 [11:55<19:42, 3.20it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q98538839 - https://swerik-project.github.io/person-catalog/i-TUyWWYGDFXW92GhiG3CLwF\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 42%|████████ | 2612/6177 [13:06<15:19, 3.88it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q98937434 - https://swerik-project.github.io/person-catalog/i-EzcxskgMAVbnq8hM2F2km9\n", | ||
"WD Q98937482 - https://swerik-project.github.io/person-catalog/i-HYFwSCrwnemwyJTLMcyqvN\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 59%|███████████▏ | 3625/6177 [18:31<11:41, 3.64it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WD Q117223085 - https://swerik-project.github.io/person-catalog/i-EQM2NLR1fbN9izUQhjTRGR\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing records: 60%|███████████▍ | 3710/6177 [18:58<13:09, 3.13it/s]" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Collected failures, always stored as (wd, swerik, message) tuples.\n", | ||
"errors = []\n", | ||
"\n", | ||
"# checkurl() increments these globals; start from zero so that re-running\n", | ||
"# this cell does not add a second pass on top of the first.\n", | ||
"NrValid = 0\n", | ||
"NrnotValid = 0\n", | ||
"\n", | ||
"for result in tqdm(bindings,\n", | ||
"                   total=len(bindings),\n", | ||
"                   desc=\"Processing records\"):\n", | ||
"    swerik = result[\"swerik\"][\"value\"]\n", | ||
"    wdurl = result[\"wd\"][\"value\"]\n", | ||
"    # Reduce the entity URL to the bare id for reporting.\n", | ||
"    wd = str(wdurl).replace(\"http://www.wikidata.org/entity/\",\"\")\n", | ||
"    try:\n", | ||
"        outcome = checkurl(wd, swerik)\n", | ||
"    except Exception as e:\n", | ||
"        # Keep going with the remaining records, but remember the failure.\n", | ||
"        errors.append((wd, swerik, str(e)))\n", | ||
"        continue\n", | ||
"    # A call that returned nothing has already counted and printed its\n", | ||
"    # result; unpacking None here would record a bogus error per record.\n", | ||
"    if outcome is None:\n", | ||
"        continue\n", | ||
"    success, error_message = outcome\n", | ||
"    if not success and error_message:\n", | ||
"        errors.append((wd, swerik, str(error_message)))\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "b9d7be57-8e8e-4670-a031-bf7124906aac", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Print the totals gathered while checking the URLs.\n", | ||
"print(f\"Number of valid URLs: {NrValid}\")\n", | ||
"print(f\"Number of invalid URLs: {NrnotValid}\")\n", | ||
"\n", | ||
"if errors:\n", | ||
"    print(\"\\nErrors encountered:\")\n", | ||
"    for entry in errors:\n", | ||
"        if isinstance(entry, tuple) and len(entry) == 3:\n", | ||
"            wd, swerik, error_msg = entry\n", | ||
"            print(f\"Error with wd: {wd}, swerik: {swerik} - {error_msg}\")\n", | ||
"        else:\n", | ||
"            # The list can also hold plain message strings; unpacking one\n", | ||
"            # into three names would raise, so print it unchanged.\n", | ||
"            print(entry)\n", | ||
"else:\n", | ||
"    print(\"\\nAll records processed without errors.\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "125dabca-1841-4ebc-857e-730d07ab8b34", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Read the clock once so both lines below describe the same instant.\n", | ||
"end_time = datetime.now()\n", | ||
"print(\"End run: \", end_time)\n", | ||
"print('Time elapsed (hh:mm:ss.ms) {}'.format(end_time - start_time))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |