Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

News API bot #1

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 191 additions & 0 deletions news-api/feed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import os
import urllib.request
import json
from typing import Dict, Optional, List, Union
from urllib.parse import urlencode
from rdflib import Graph, Literal, Namespace, BNode, URIRef
from rdflib.resource import Resource
from rdflib.namespace import RDF, XSD
from openai import OpenAI

# Define Namespaces
SCHEMA = Namespace("http://schema.org/")

def fetch_data(api_key: str, query_params: Dict) -> Dict:
# Check if the API key is available
if not api_key:
raise ValueError("API key is not provided.")

# Add the API key to the query parameters
query_params['apiKey'] = api_key

# Encode query parameters
encoded_params = urlencode(query_params)

# Construct the full URL with the encoded query parameters
url = f"https://newsapi.org/v2/everything?{encoded_params}"

# Fetch and process the JSON data within a response context
with urllib.request.urlopen(url) as response:
# Load the JSON response
return json.loads(response.read().decode())

# Function to fetch and convert JSON to RDF, accepts query parameters dict and api_key
def to_graph(data: Dict) -> Graph:
# Create RDF Graph
g = Graph()

# Loop through articles and create RDFLib resources
for article_data in data['articles']:
# Create a blank node for the article
article = g.resource(BNode())

# Set article type as schema.org Article
article.set(RDF.type, SCHEMA.Article)

# Set properties on the RDFLib Resource using Pythonic key checks
if "title" in article_data:
article.set(SCHEMA.headline, Literal(article_data['title']))

if "description" in article_data:
article.set(SCHEMA.description, Literal(article_data['description']))

if "author" in article_data:
article.set(SCHEMA.author, Literal(article_data['author']))

if "url" in article_data:
article.set(SCHEMA.url, URIRef(article_data['url']))

if "publishedAt" in article_data:
article.set(SCHEMA.datePublished, Literal(article_data['publishedAt'], datatype=XSD.dateTime))

# Add source information
if "source" in article_data:
source_data = article_data['source']
if "name" in source_data:
source = g.resource(BNode())
source.set(RDF.type, SCHEMA.Organization)
source.set(SCHEMA.name, Literal(source_data['name']))

# Link the source (publisher) to the article
article.set(SCHEMA.publisher, source)

# Add content if available
if "content" in article_data:
article.set(SCHEMA.articleBody, Literal(article_data['content']))


# Return the graph in Turtle format
return g

def extract_named_entities(openai_api_key: str, description: str) -> Optional[List[Dict[str, Optional[str]]]]:
"""
Extract named entities from the given description using the ChatGPT API with the function_call mode.
Returns a list of dictionaries, each containing 'name' and 'uri' for the entities.
"""
client = OpenAI(
# This is the default and can be omitted
api_key=openai_api_key
)

chat_completion = client.chat.completions.create(
model="gpt-4-0613", # Ensure you're using the latest GPT model that supports function calls
messages=[
{
"role": "system",
"content": "You are an assistant that extracts named entities from text and links them to DBPedia or Wikidata."
},
{
"role": "user",
"content": f"Extract named entities from the following description and return the result as a JSON object with 'name' and 'uri' fields. If there is no URI, use null.\n\nDescription: {description}"
}
],
functions=[
{
"name": "extract_named_entities",
"description": "Extract named entities from text and link them to DBPedia or Wikidata URIs.",
"parameters": {
"type": "object",
"properties": {
"entities": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The name of the entity."
},
"uri": {
"type": ["string", "null"],
"description": "The DBPedia or Wikidata URI of the entity, or null if unavailable."
}
},
"required": ["name", "uri"]
}
}
},
"required": ["entities"]
}
}
],
function_call={"name": "extract_named_entities"}, # This triggers the function return in JSON
temperature=0
)

# Get the function response
if 'choices' in response and len(response['choices']) > 0:
function_call = response['choices'][0].get('message', {}).get('function_call', {})
if function_call and 'arguments' in function_call:
# Parse the JSON response from the function_call
arguments = json.loads(function_call['arguments'])
return arguments.get('entities', [])

return None

# Type hint for an entity dictionary, with 'name' and 'uri' fields
Entity = Dict[str, Optional[str]]

def link_entities_with_article(g: Graph, article: Resource, entities: List[Entity]) -> None:
for entity in entities:
entity_name: Optional[str] = entity.get('name')
entity_uri: Optional[str] = entity.get('uri')

if entity_uri:
entity_resource = g.resource(URIRef(entity_uri))
entity_resource.set(RDF.type, SCHEMA.Thing)

if entity_name:
entity_resource.set(SCHEMA.name, Literal(entity_name))

article.set(SCHEMA.about, entity_resource)


# Example of calling the method with query parameters and API key
query_params = {
"q": "sparql",
"from": "2024-09-05",
"sortBy": "popularity"
}

news_api_key = os.getenv("NEWS_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

news_json = fetch_data(news_api_key, query_params)
articles_graph = to_graph(news_json)

# Iterate over all subjects in the graph that are of type schema:Article
for article_subject in articles_graph.subjects(RDF.type, SCHEMA.Article):
# Cast each article subject to an RDFLib Resource
article_resource = Resource(articles_graph, article_subject)

# Example: Linking extracted entities to this article
description = article_resource.value(SCHEMA.description)
if description:
entities = extract_named_entities(openai_api_key, description)
if entities:
link_entities_with_article(articles_graph, article, entities)


# Print the RDF output in Turtle format
print(articles_graph.serialize(format='turtle'))