diff --git a/psl_proof/__main__.py b/psl_proof/__main__.py
index 61ade41..4e4e5f1 100644
--- a/psl_proof/__main__.py
+++ b/psl_proof/__main__.py
@@ -19,6 +19,7 @@ def load_config() -> Dict[str, Any]:
         'input_dir': INPUT_DIR,
         'salt': '5EkntCWI',
         'validator_base_api_url': 'https://api.vana.genesis.dfusion.ai'
+        #'validator_base_api_url': 'https://d2a2-169-0-167-27.ngrok-free.app'
     }
     logging.info(f"Using config: {json.dumps(config, indent=2)}")
     return config
diff --git a/psl_proof/models/cargo_data.py b/psl_proof/models/cargo_data.py
index bfe56df..ab92ea1 100644
--- a/psl_proof/models/cargo_data.py
+++ b/psl_proof/models/cargo_data.py
@@ -7,7 +7,7 @@
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 
-from psl_proof.models.submission_dtos import SubmissionChat
+from psl_proof.models.submission_dtos import ChatHistory, SubmissionChat
 
 # Enum for DataSource
 class DataSource(Enum):
@@ -102,7 +102,7 @@ class SourceData:
     user: str
     submission_token: str
     submission_date: datetime
-    proof_token: str 
+    proof_token: str
     source_chats: List[SourceChatData] # List of SourceChatData instances
 
     def __init__(self, source, submission_token, submission_date, user, source_chats=None, proof_token=None, ):
@@ -167,6 +167,7 @@ class CargoData:
     source_id: str
     current_timestamp: datetime = None
     last_submission: datetime = None
+    chat_histories: List[ChatHistory] = field(default_factory=list)
     chat_list: List[SubmissionChat] = field(default_factory=list)
     # chat_list: List[ChatData] = field(default_factory=list)
     total_quality = 0.0
@@ -183,8 +184,8 @@ def to_dict(self):
         # Return a dictionary representation of the CargoData object
         return {
             "source_data": self.source_data, # Assuming source_data can be serialized directly
-            "source_id": self.source_id #,
-            #"chat_list": [chat.to_dict() for chat in self.chat_list] # Convert each ChatData in the list to a dict
+            "source_id": self.source_id,
+            "chat_list": self.get_chat_list_data()
         }
 
     @staticmethod
diff --git a/psl_proof/proof.py b/psl_proof/proof.py
index 59fad85..823df0e 100644
--- a/psl_proof/proof.py
+++ b/psl_proof/proof.py
@@ -139,8 +139,8 @@ def generate(self) -> ProofResponse:
             'did_score_content': True,
             'source': source_data.source.name,
             'revision': data_revision,
-            'submitted_on': current_timestamp.isoformat()
-            #'chat_data': None #RL: No longer generate useful data...
+            'submitted_on': current_timestamp.isoformat(),
+            'chat_data': cargo_data.get_chat_list_data()
         }
 
         self.proof_response.metadata = metadata
@@ -220,7 +220,7 @@ def get_source_data(
 
     source_data = SourceData(
         source=input_source,
-        user=input_user,
+        user = input_user,
         submission_token = submission_token,
         submission_date = submission_timestamp
     )
diff --git a/psl_proof/utils/feature_extraction.py b/psl_proof/utils/feature_extraction.py
new file mode 100644
index 0000000..76f1ef5
--- /dev/null
+++ b/psl_proof/utils/feature_extraction.py
@@ -0,0 +1,26 @@
+from keybert import KeyBERT
+from transformers import pipeline
+
+def get_keywords_keybert(chats):
+    kw_model = KeyBERT(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+    keywords = kw_model.extract_keywords(chats)
+    return keywords
+
+def get_sentiment_data(chats):
+    #Patrick_ToCheck this model do not work...
+    #sentiment_analyzer = pipeline("sentiment-analysis", model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+    sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")
+
+    messages = chats.split(">") #TODO use real way to split out different messages
+    #TODO: make sure no single message is too long for classification, can break it up if length too long
+    sentiments = sentiment_analyzer(messages)
+    category_scores = {"positive": 0, "neutral": 0, "negative": 0}
+    category_counts = {"positive": 0, "neutral": 0, "negative": 0}
+    for result in sentiments:
+        label = result['label'].lower()
+        category_scores[label] += result['score']
+        category_counts[label] += 1
+    # Normalize scores by dividing by the total number of messages
+    total_messages = len(messages)
+    normalized_scores = {key: (category_scores[key] / total_messages) for key in category_scores}
+    return normalized_scores
diff --git a/psl_proof/utils/validate_data.py b/psl_proof/utils/validate_data.py
index 06ca1fa..a380513 100644
--- a/psl_proof/utils/validate_data.py
+++ b/psl_proof/utils/validate_data.py
@@ -1,9 +1,12 @@
+import math
+
 from psl_proof.models.cargo_data import CargoData, ChatData, SourceChatData, SourceData
 from psl_proof.models.proof_response import ProofResponse
 from typing import List, Dict, Any
 from psl_proof.models.submission_dtos import ChatHistory, SubmissionChat, ChatHistory, SubmissionHistory
-import math
+from psl_proof.utils.feature_extraction import get_keywords_keybert, get_sentiment_data
+
 
 def get_total_score(quality, uniqueness)-> float:
     #total_score = quality # Since uniqueness always 1
@@ -87,6 +90,11 @@ def get_uniqueness_score(
             return 0.0
         return 1.0 # unique
 
+    #if time_in_hours <= 24: # within 24 Hours..
+    #    print(f"time_in_hours:{time_in_hours}")
+    #    time_decay = math.log(2) / 12 #half_life: 12hrs, more recent less scores...
+    #    return math.exp(-time_decay * (24 - time_in_hours))
+
     # If no matching source_chat_id is found, return 1
     return 1.0
 
@@ -130,14 +138,22 @@ def validate_data(
         cargo_data.total_uniqueness += uniqueness
 
         #print(f"source_contents: {source_contents}")
-        #RL: No longer generate data for sentiment & keywords
-        # Create a ChatData instance and add it to the list
-        #chat_data = ChatData(
-        #    chat_length=contents_length,
-        #    chat_start_on = source_chat.chat_start_on,
-        #    chat_ended_on = source_chat.chat_ended_on
-        #)
+
+        chat_sentiment = get_sentiment_data(
+            source_contents
+        )
+        chat_keywords = get_keywords_keybert(
+            source_contents
+        )
+
+        chat_data = ChatData(
+            chat_length=contents_length,
+            chat_start_on = source_chat.chat_start_on,
+            chat_ended_on = source_chat.chat_ended_on,
+            sentiment = chat_sentiment,
+            keywords = chat_keywords
+        )
         #print(f"chat_data: {chat_data}")
-        #cargo_data.chat_list.append(
-        #    chat_data
-        #)
+        cargo_data.chat_list.append(
+            chat_data
+        )
diff --git a/requirements.txt b/requirements.txt
index d654a46..49ee127 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,5 @@ requests==2.32.3
 typing_extensions==4.12.2
 urllib3==2.2.3
 xxhash==3.5.0
+keybert==0.8.5
+transformers==4.47.0