Integration with AI Chat Result. #10

Open · wants to merge 9 commits into base: main
1 change: 1 addition & 0 deletions psl_proof/__main__.py
@@ -19,6 +19,7 @@ def load_config() -> Dict[str, Any]:
        'input_dir': INPUT_DIR,
        'salt': '5EkntCWI',
        'validator_base_api_url': 'https://api.vana.genesis.dfusion.ai'
        #'validator_base_api_url': 'https://d2a2-169-0-167-27.ngrok-free.app'
    }
    logging.info(f"Using config: {json.dumps(config, indent=2)}")
    return config
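The added comment points at swapping the validator endpoint for a local ngrok tunnel during testing. A minimal sketch of doing that without editing the file, via an environment variable (the variable name VALIDATOR_BASE_API_URL and the helper below are assumptions, not part of this PR):

import os

# Hypothetical helper (not in this PR): prefer an environment override such as a
# local ngrok tunnel, otherwise fall back to the production validator endpoint.
def resolve_validator_url(default: str = 'https://api.vana.genesis.dfusion.ai') -> str:
    return os.environ.get('VALIDATOR_BASE_API_URL', default)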
9 changes: 5 additions & 4 deletions psl_proof/models/cargo_data.py
@@ -7,7 +7,7 @@
from dataclasses import dataclass, field
from datetime import datetime, timezone

from psl_proof.models.submission_dtos import SubmissionChat
from psl_proof.models.submission_dtos import ChatHistory, SubmissionChat

# Enum for DataSource
class DataSource(Enum):
@@ -102,7 +102,7 @@ class SourceData:
    user: str
    submission_token: str
    submission_date: datetime
    proof_token: str
    proof_token: str
    source_chats: List[SourceChatData] # List of SourceChatData instances

    def __init__(self, source, submission_token, submission_date, user, source_chats=None, proof_token=None, ):
@@ -167,6 +167,7 @@ class CargoData:
    source_id: str
    current_timestamp: datetime = None
    last_submission: datetime = None
    chat_histories: List[ChatHistory] = field(default_factory=list)
    chat_list: List[SubmissionChat] = field(default_factory=list)
    # chat_list: List[ChatData] = field(default_factory=list)
    total_quality = 0.0
@@ -183,8 +184,8 @@ def to_dict(self):
        # Return a dictionary representation of the CargoData object
        return {
            "source_data": self.source_data, # Assuming source_data can be serialized directly
            "source_id": self.source_id #,
            #"chat_list": [chat.to_dict() for chat in self.chat_list] # Convert each ChatData in the list to a dict
            "source_id": self.source_id,
            "chat_list": self.get_chat_list_data()
        }

@staticmethod
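to_dict() now emits the validated chats through get_chat_list_data(), which is defined elsewhere in cargo_data.py and not shown in this diff. A minimal standalone sketch of the shape that call implies (an assumption, not the repository's actual implementation), assuming each chat entry exposes a to_dict() method:

from typing import Any, Dict, List

def get_chat_list_data(chat_list: List[Any]) -> List[Dict[str, Any]]:
    # Hypothetical sketch: on CargoData this would iterate self.chat_list instead
    # of taking the list as an argument.
    return [chat.to_dict() for chat in chat_list]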
6 changes: 3 additions & 3 deletions psl_proof/proof.py
@@ -139,8 +139,8 @@ def generate(self) -> ProofResponse:
            'did_score_content': True,
            'source': source_data.source.name,
            'revision': data_revision,
            'submitted_on': current_timestamp.isoformat()
            #'chat_data': None #RL: No longer generate useful data...
            'submitted_on': current_timestamp.isoformat(),
            'chat_data': cargo_data.get_chat_list_data()
        }
        self.proof_response.metadata = metadata

Expand Down Expand Up @@ -220,7 +220,7 @@ def get_source_data(

    source_data = SourceData(
        source=input_source,
        user=input_user,
        user = input_user,
        submission_token = submission_token,
        submission_date = submission_timestamp
    )
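With the first hunk, the proof metadata now carries the per-chat results returned by cargo_data.get_chat_list_data(). An illustrative payload, with every value invented for the example and the chat entry shape assumed from the ChatData fields used later in this PR:

# Illustrative only: all values below are made up.
metadata = {
    'did_score_content': True,
    'source': 'TELEGRAM',                  # hypothetical DataSource name
    'revision': '01.01',                   # hypothetical
    'submitted_on': '2025-01-01T00:00:00+00:00',
    'chat_data': [{
        'chat_length': 742,
        'sentiment': {'positive': 0.61, 'neutral': 0.30, 'negative': 0.09},
        'keywords': [('dinner', 0.52), ('friday', 0.44)],
    }],
}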
26 changes: 26 additions & 0 deletions psl_proof/utils/feature_extraction.py
@@ -0,0 +1,26 @@
from keybert import KeyBERT
from transformers import pipeline

def get_keywords_keybert(chats):
    kw_model = KeyBERT(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    keywords = kw_model.extract_keywords(chats)
    return keywords

def get_sentiment_data(chats):
    # Patrick_ToCheck: this model does not work...
    #sentiment_analyzer = pipeline("sentiment-analysis", model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")

    messages = chats.split(">")  # TODO: use a real way to split out different messages
    # TODO: make sure no single message is too long for classification; break it up if it is
    sentiments = sentiment_analyzer(messages)
    category_scores = {"positive": 0, "neutral": 0, "negative": 0}
    category_counts = {"positive": 0, "neutral": 0, "negative": 0}
    for result in sentiments:
        label = result['label'].lower()
        category_scores[label] += result['score']
        category_counts[label] += 1
    # Normalize scores by dividing by the total number of messages
    total_messages = len(messages)
    normalized_scores = {key: (category_scores[key] / total_messages) for key in category_scores}
    return normalized_scores
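A quick way to exercise the two helpers on a made-up conversation (both models are downloaded from the Hugging Face Hub on first use, and exact scores will vary):

from psl_proof.utils.feature_extraction import get_keywords_keybert, get_sentiment_data

# Made-up sample; messages are separated by '>' as the current splitting logic expects.
sample = "are we still on for dinner tonight?>yes, 7pm at the usual place>great, see you then"

print(get_keywords_keybert(sample))   # list of (keyword, score) tuples
print(get_sentiment_data(sample))     # e.g. {'positive': ..., 'neutral': ..., 'negative': ...}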
38 changes: 27 additions & 11 deletions psl_proof/utils/validate_data.py
@@ -1,9 +1,12 @@
import math

from psl_proof.models.cargo_data import CargoData, ChatData, SourceChatData, SourceData
from psl_proof.models.proof_response import ProofResponse

from typing import List, Dict, Any
from psl_proof.models.submission_dtos import ChatHistory, SubmissionChat, SubmissionHistory
import math
from psl_proof.utils.feature_extraction import get_keywords_keybert, get_sentiment_data


def get_total_score(quality, uniqueness) -> float:
    #total_score = quality # Since uniqueness always 1
@@ -87,6 +90,11 @@ def get_uniqueness_score(
return 0.0
return 1.0 # unique

#if time_in_hours <= 24: # within 24 Hours..
# print(f"time_in_hours:{time_in_hours}")
# time_decay = math.log(2) / 12 #half_life: 12hrs, more recent less scores...
# return math.exp(-time_decay * (24 - time_in_hours))

# If no matching source_chat_id is found, return 1
return 1.0
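The commented-out branch above sketches a 12-hour half-life decay: for a resubmission seen t hours after the original (t <= 24), the uniqueness score would be exp(-(ln 2 / 12) * (24 - t)), i.e. 1.0 at 24 h, 0.5 at 12 h, and 0.25 at 0 h. A small check of those values (illustrative only, since this code path is disabled):

import math

# Reproduces the commented-out time-decay formula above.
time_decay = math.log(2) / 12  # 12-hour half-life
for hours in (0, 12, 24):
    score = math.exp(-time_decay * (24 - hours))
    print(hours, round(score, 2))  # 0 -> 0.25, 12 -> 0.5, 24 -> 1.0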

@@ -130,14 +138,22 @@ def validate_data(
        cargo_data.total_uniqueness += uniqueness

        #print(f"source_contents: {source_contents}")
        #RL: No longer generate data for sentiment & keywords
        # Create a ChatData instance and add it to the list
        #chat_data = ChatData(
        #    chat_length=contents_length,
        #    chat_start_on = source_chat.chat_start_on,
        #    chat_ended_on = source_chat.chat_ended_on
        #)

        chat_sentiment = get_sentiment_data(
            source_contents
        )
        chat_keywords = get_keywords_keybert(
            source_contents
        )

        chat_data = ChatData(
            chat_length=contents_length,
            chat_start_on = source_chat.chat_start_on,
            chat_ended_on = source_chat.chat_ended_on,
            sentiment = chat_sentiment,
            keywords = chat_keywords
        )
        #print(f"chat_data: {chat_data}")
        #cargo_data.chat_list.append(
        #    chat_data
        #)
        cargo_data.chat_list.append(
            chat_data
        )
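The new ChatData(...) call passes sentiment and keywords, so the dataclass in cargo_data.py must accept those fields; its definition is not part of this diff. A minimal sketch of the shape the call site implies (an assumption, not the repository's actual class):

from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Tuple

@dataclass
class ChatData:
    # Sketch only: field names mirror the keyword arguments used above.
    chat_length: int
    chat_start_on: Optional[datetime] = None
    chat_ended_on: Optional[datetime] = None
    sentiment: Dict[str, float] = field(default_factory=dict)
    keywords: List[Tuple[str, float]] = field(default_factory=list)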
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,3 +10,5 @@ requests==2.32.3
typing_extensions==4.12.2
urllib3==2.2.3
xxhash==3.5.0
keybert==0.8.5
transformers==4.47.0