Skip to content

Commit

Permalink
version 0.1
Browse files Browse the repository at this point in the history
SWERIKS check
  • Loading branch information
salgo60 committed Nov 12, 2024
1 parent 9b155f7 commit 389ba4e
Show file tree
Hide file tree
Showing 2 changed files with 395 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Notebook/.ipynb_checkpoints/SWERIKS check-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
389 changes: 389 additions & 0 deletions Notebook/SWERIKS check.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,389 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6bf94f08-39fc-47f8-b310-b7ff0a1c77fd",
"metadata": {},
"source": [
"### Linkroot WD SWERIK\n",
"See https://github.com/swerik-project/riksdagen-persons/issues/27#issuecomment-2456456383\n",
"\n",
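"Checks for link rot: verifies that every SWERIK ID stored in Wikidata property [P12192](https://www.wikidata.org/wiki/Property:P12192) still resolves to a page in the SWERIK person catalog.\n",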
"\n",
"Version 0.1 adds a progress bar."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "da850da8-c182-4cf6-ae82-5734b2c25c95",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last run: 2024-11-12 10:09:02.380817\n"
]
}
],
"source": [
"from datetime import datetime\n",
"# Read the clock once so the printed timestamp is exactly the baseline\n",
"# used later for the elapsed-time report.\n",
"start_time = datetime.now()\n",
"print(\"Last run: \", start_time)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "73aca08c-58ca-46ad-b922-c3fb10169645",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: wikibaseintegrator in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.12.8)\n",
"Requirement already satisfied: backoff<3.0.0,>=2.2.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (2.2.1)\n",
"Requirement already satisfied: mwoauth<0.5.0,>=0.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (0.4.0)\n",
"Requirement already satisfied: oauthlib<4.0.0,>=3.2.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (3.2.2)\n",
"Requirement already satisfied: requests<3.0.0,>=2.31.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (2.31.0)\n",
"Requirement already satisfied: requests-oauthlib<3.0.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (2.0.0)\n",
"Requirement already satisfied: ujson<6.0.0,>=5.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from wikibaseintegrator) (5.10.0)\n",
"Requirement already satisfied: PyJWT>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from mwoauth<0.5.0,>=0.4.0->wikibaseintegrator) (2.9.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (1.26.18)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->wikibaseintegrator) (2024.2.2)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Library/Frameworks/Python.framework/Versions/3.12/bin/python3.12 -m pip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"import sys\n",
"# Install into the interpreter that runs this kernel. tqdm is imported by a\n",
"# later cell but is not pulled in by wikibaseintegrator, so install it too.\n",
"!{sys.executable} -m pip install wikibaseintegrator tqdm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "973666b1-3552-482e-b4ac-66cfd141ab6d",
"metadata": {},
"outputs": [],
"source": [
"from wikibaseintegrator.wbi_helpers import execute_sparql_query\n",
"from wikibaseintegrator import WikibaseIntegrator \n",
"from wikibaseintegrator.wbi_config import config as wbi_config"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5a198701-5776-4cef-96b3-e8b803446d11",
"metadata": {},
"outputs": [],
"source": [
"# User agent string that WikibaseIntegrator sends with its requests.\n",
"wbi_config['USER_AGENT'] = 'WikibaseIntegrator in PAWS by salgo60'\n",
"wbi = WikibaseIntegrator()\n",
"\n",
"# Every item (?wd) that has a P12192 statement, together with its value (?swerik).\n",
"SWERIK_ID_QUERY = \"\"\"\n",
"SELECT ?wd ?swerik WHERE {\n",
" ?wd wdt:P12192 ?swerik.\n",
"} \n",
"\"\"\"\n",
"results = execute_sparql_query(SWERIK_ID_QUERY)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ccdece9f-5f9a-451e-b1d1-8767c4b23160",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 6177 results\n"
]
}
],
"source": [
"# One binding per (item, SWERIK id) row returned by the SPARQL query.\n",
"bindings = results[\"results\"][\"bindings\"]\n",
"print(f\"Found {len(bindings)} results\")\n",
"count = 1\n",
"\n",
"# Tallies that checkurl() updates through its `global` declarations.\n",
"NrnotValid = NrValid = 0"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a5832d52-5f28-4dab-b2aa-487c421c2a3a",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"def checkurl(wd, swerik, timeout=30):\n",
"    \"\"\"Check whether the SWERIK person-catalog page for one Wikidata item exists.\n",
"\n",
"    Fetches https://swerik-project.github.io/person-catalog/<swerik> and updates\n",
"    the module-level tallies: NrValid on HTTP 200, NrnotValid on any other\n",
"    status or on a failed request. Every failing link is printed together\n",
"    with its Wikidata id.\n",
"\n",
"    Args:\n",
"        wd: Wikidata item id (for example \"Q4934552\"); used only in messages.\n",
"        swerik: SWERIK person id appended to the catalog base URL.\n",
"        timeout: Seconds to wait for the server before the request is\n",
"            abandoned, so a stalled connection cannot hang the whole run.\n",
"\n",
"    Returns:\n",
"        A (success, error_message) tuple on every path:\n",
"        (True, None) for HTTP 200;\n",
"        (False, None) for any other status, which has already been printed\n",
"        and counted;\n",
"        (False, message) when the request itself raised.\n",
"    \"\"\"\n",
"    global NrValid\n",
"    global NrnotValid\n",
"    base_url = f\"https://swerik-project.github.io/person-catalog/{swerik}\"\n",
"    try:\n",
"        response = requests.get(base_url, timeout=timeout)\n",
"    except requests.exceptions.RequestException as e:\n",
"        NrnotValid += 1\n",
"        print(f\"WD {wd} - {base_url}\")\n",
"        print(f\"Error: {e}\")\n",
"        return False, f\"WD {wd} - {base_url} - Error: {e}\"\n",
"\n",
"    if response.status_code == 200:\n",
"        NrValid += 1\n",
"        return True, None\n",
"\n",
"    # Page missing or otherwise unavailable: report it, but it is not a\n",
"    # processing error, so no message is returned.\n",
"    NrnotValid += 1\n",
"    print(f\"WD {wd} - {base_url}\")\n",
"    return False, None"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c8917366-acbc-4c7e-a159-e4d858b27b91",
"metadata": {},
"outputs": [],
"source": [
"# pip install tqdm\n",
"from tqdm.notebook import tqdm \n",
"from time import sleep\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b97e742-71be-4274-8bfc-a2d6b6112f95",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 12%|██▎ | 713/6177 [03:05<28:31, 3.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q4934552 - https://swerik-project.github.io/person-catalog/i-PCZrYEHwPaEeNTZphEsWTv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 15%|██▉ | 905/6177 [04:05<28:28, 3.09it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q4957371 - https://swerik-project.github.io/person-catalog/i-31gPpUoSm7zqzQckVmfPGy\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 16%|███▏ | 995/6177 [04:33<27:36, 3.13it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q4970175 - https://swerik-project.github.io/person-catalog/i-UX4D3JJdrTjFBf2zyfHx5t\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 19%|███▌ | 1159/6177 [05:25<25:59, 3.22it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q4976825 - https://swerik-project.github.io/person-catalog/i-NvxzaU2RSok83zCskNAuhg\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 38%|███████▏ | 2352/6177 [11:45<19:55, 3.20it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q97971262 - https://swerik-project.github.io/person-catalog/i-RH6VCPhyxs9yYcfXJzPxYT\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 38%|███████▎ | 2359/6177 [11:46<18:29, 3.44it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q97971276 - https://swerik-project.github.io/person-catalog/i-Cdgsqn4Ts9WMwbjXcE4537\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 38%|███████▎ | 2377/6177 [11:52<19:45, 3.21it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q98271639 - https://swerik-project.github.io/person-catalog/i-x1CuoKmRHYgQr9i2kh3B5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 39%|███████▎ | 2388/6177 [11:55<19:42, 3.20it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q98538839 - https://swerik-project.github.io/person-catalog/i-TUyWWYGDFXW92GhiG3CLwF\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 42%|████████ | 2612/6177 [13:06<15:19, 3.88it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q98937434 - https://swerik-project.github.io/person-catalog/i-EzcxskgMAVbnq8hM2F2km9\n",
"WD Q98937482 - https://swerik-project.github.io/person-catalog/i-HYFwSCrwnemwyJTLMcyqvN\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 59%|███████████▏ | 3625/6177 [18:31<11:41, 3.64it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WD Q117223085 - https://swerik-project.github.io/person-catalog/i-EQM2NLR1fbN9izUQhjTRGR\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing records: 60%|███████████▍ | 3710/6177 [18:58<13:09, 3.13it/s]"
]
}
],
"source": [
"# Problems hit while checking, always stored as (wd, swerik, message) tuples.\n",
"errors = []\n",
"# Start the tallies from zero so re-running this cell does not double count.\n",
"NrValid = 0\n",
"NrnotValid = 0\n",
"\n",
"for result in tqdm(bindings,\n",
"                   total=len(bindings),\n",
"                   desc=\"Processing records\"):\n",
"    swerik = result[\"swerik\"][\"value\"]\n",
"    wdurl = result[\"wd\"][\"value\"]\n",
"    # The query returns full entity URIs; keep only the Q-id.\n",
"    wd = str(wdurl).replace(\"http://www.wikidata.org/entity/\", \"\")\n",
"    try:\n",
"        outcome = checkurl(wd, swerik)\n",
"    except Exception as e:\n",
"        # Unexpected failure inside the check: record it and keep going.\n",
"        errors.append((wd, swerik, str(e)))\n",
"        continue\n",
"\n",
"    # Tolerate a checkurl() that returns nothing for a completed check;\n",
"    # only a (success, message) pair that carries a message is an error.\n",
"    if outcome is None:\n",
"        continue\n",
"    success, error_message = outcome\n",
"    if not success and error_message:\n",
"        errors.append((wd, swerik, error_message))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9d7be57-8e8e-4670-a031-bf7124906aac",
"metadata": {},
"outputs": [],
"source": [
"# Print the tallies, then every recorded processing error.\n",
"print(f\"Number of valid URLs: {NrValid}\")\n",
"print(f\"Number of invalid URLs: {NrnotValid}\")\n",
"\n",
"if errors:\n",
"    print(\"\\nErrors encountered:\")\n",
"    for entry in errors:\n",
"        # Entries are normally (wd, swerik, message) tuples, but a bare\n",
"        # message string may also have been appended; print either form\n",
"        # instead of failing on the unpack.\n",
"        if isinstance(entry, tuple) and len(entry) == 3:\n",
"            err_wd, err_swerik, error_msg = entry\n",
"            print(f\"Error with wd: {err_wd}, swerik: {err_swerik} - {error_msg}\")\n",
"        else:\n",
"            print(f\"Error: {entry}\")\n",
"else:\n",
"    print(\"\\nAll records processed without errors.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "125dabca-1841-4ebc-857e-730d07ab8b34",
"metadata": {},
"outputs": [],
"source": [
"# Read the clock once so the printed end time and the elapsed time agree.\n",
"end_time = datetime.now()\n",
"print(\"End run: \", end_time)\n",
"print('Time elapsed (hh:mm:ss.ms) {}'.format(end_time - start_time))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 389ba4e

Please sign in to comment.