Integration with AI Chat Result. #10

Open · wants to merge 9 commits into base: main
1 change: 1 addition & 0 deletions psl_proof/__main__.py
@@ -19,6 +19,7 @@ def load_config() -> Dict[str, Any]:
        'input_dir': INPUT_DIR,
        'salt': '5EkntCWI',
        'validator_base_api_url': 'https://api.vana.genesis.dfusion.ai'
        #'validator_base_api_url': 'https://d2a2-169-0-167-27.ngrok-free.app'
    }
    logging.info(f"Using config: {json.dumps(config, indent=2)}")
    return config
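The added comment points at swapping the validator endpoint for a local ngrok tunnel during testing. A minimal sketch of doing that without editing the file, via an environment variable (the variable name VALIDATOR_BASE_API_URL and the helper below are assumptions, not part of this PR):

import os

# Hypothetical helper (not in this PR): prefer an environment override such as a
# local ngrok tunnel, otherwise fall back to the production validator endpoint.
def resolve_validator_url(default: str = 'https://api.vana.genesis.dfusion.ai') -> str:
    return os.environ.get('VALIDATOR_BASE_API_URL', default)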
9 changes: 5 additions & 4 deletions psl_proof/models/cargo_data.py
@@ -7,7 +7,7 @@
from dataclasses import dataclass, field
from datetime import datetime, timezone

from psl_proof.models.submission_dtos import SubmissionChat
from psl_proof.models.submission_dtos import ChatHistory, SubmissionChat

# Enum for DataSource
class DataSource(Enum):
@@ -102,7 +102,7 @@ class SourceData:
    user: str
    submission_token: str
    submission_date: datetime
    proof_token: str
    proof_token: str
    source_chats: List[SourceChatData] # List of SourceChatData instances

    def __init__(self, source, submission_token, submission_date, user, source_chats=None, proof_token=None, ):
@@ -167,6 +167,7 @@ class CargoData:
    source_id: str
    current_timestamp: datetime = None
    last_submission: datetime = None
    chat_histories: List[ChatHistory] = field(default_factory=list)
    chat_list: List[SubmissionChat] = field(default_factory=list)
    # chat_list: List[ChatData] = field(default_factory=list)
    total_quality = 0.0
@@ -183,8 +184,8 @@ def to_dict(self):
        # Return a dictionary representation of the CargoData object
        return {
            "source_data": self.source_data, # Assuming source_data can be serialized directly
            "source_id": self.source_id #,
            #"chat_list": [chat.to_dict() for chat in self.chat_list] # Convert each ChatData in the list to a dict
            "source_id": self.source_id,
            "chat_list": self.get_chat_list_data()
        }

@staticmethod
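to_dict() now emits the validated chats through get_chat_list_data(), which is defined elsewhere in cargo_data.py and not shown in this diff. A minimal standalone sketch of the shape that call implies (an assumption, not the repository's actual implementation), assuming each chat entry exposes a to_dict() method:

from typing import Any, Dict, List

def get_chat_list_data(chat_list: List[Any]) -> List[Dict[str, Any]]:
    # Hypothetical sketch: on CargoData this would iterate self.chat_list instead
    # of taking the list as an argument.
    return [chat.to_dict() for chat in chat_list]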
6 changes: 3 additions & 3 deletions psl_proof/proof.py
@@ -139,8 +139,8 @@ def generate(self) -> ProofResponse:
            'did_score_content': True,
            'source': source_data.source.name,
            'revision': data_revision,
            'submitted_on': current_timestamp.isoformat()
            #'chat_data': None #RL: No longer generate useful data...
            'submitted_on': current_timestamp.isoformat(),
            'chat_data': cargo_data.get_chat_list_data()
        }
        self.proof_response.metadata = metadata

Expand Down Expand Up @@ -220,7 +220,7 @@ def get_source_data(

    source_data = SourceData(
        source=input_source,
        user=input_user,
        user = input_user,
        submission_token = submission_token,
        submission_date = submission_timestamp
    )
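With the first hunk, the proof metadata now carries the per-chat results returned by cargo_data.get_chat_list_data(). An illustrative payload, with every value invented for the example and the chat entry shape assumed from the ChatData fields used later in this PR:

# Illustrative only: all values below are made up.
metadata = {
    'did_score_content': True,
    'source': 'TELEGRAM',                  # hypothetical DataSource name
    'revision': '01.01',                   # hypothetical
    'submitted_on': '2025-01-01T00:00:00+00:00',
    'chat_data': [{
        'chat_length': 742,
        'sentiment': {'positive': 0.61, 'neutral': 0.30, 'negative': 0.09},
        'keywords': [('dinner', 0.52), ('friday', 0.44)],
    }],
}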
26 changes: 26 additions & 0 deletions psl_proof/utils/feature_extraction.py
@@ -0,0 +1,26 @@
from keybert import KeyBERT
from transformers import pipeline

def get_keywords_keybert(chats):
    kw_model = KeyBERT(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    keywords = kw_model.extract_keywords(chats)
    return keywords

def get_sentiment_data(chats):
    # Patrick_ToCheck: this model does not work...
    #sentiment_analyzer = pipeline("sentiment-analysis", model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")

    messages = chats.split(">")  # TODO: use a real way to split out different messages
    # TODO: make sure no single message is too long for classification; break it up if it is
    sentiments = sentiment_analyzer(messages)
    category_scores = {"positive": 0, "neutral": 0, "negative": 0}
    category_counts = {"positive": 0, "neutral": 0, "negative": 0}
    for result in sentiments:
        label = result['label'].lower()
        category_scores[label] += result['score']
        category_counts[label] += 1
    # Normalize scores by dividing by the total number of messages
    total_messages = len(messages)
    normalized_scores = {key: (category_scores[key] / total_messages) for key in category_scores}
    return normalized_scores
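A quick way to exercise the two helpers on a made-up conversation (both models are downloaded from the Hugging Face Hub on first use, and exact scores will vary):

from psl_proof.utils.feature_extraction import get_keywords_keybert, get_sentiment_data

# Made-up sample; messages are separated by '>' as the current splitting logic expects.
sample = "are we still on for dinner tonight?>yes, 7pm at the usual place>great, see you then"

print(get_keywords_keybert(sample))   # list of (keyword, score) tuples
print(get_sentiment_data(sample))     # e.g. {'positive': ..., 'neutral': ..., 'negative': ...}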
38 changes: 27 additions & 11 deletions psl_proof/utils/validate_data.py
@@ -1,9 +1,12 @@
import math

from psl_proof.models.cargo_data import CargoData, ChatData, SourceChatData, SourceData
from psl_proof.models.proof_response import ProofResponse

from typing import List, Dict, Any
from psl_proof.models.submission_dtos import ChatHistory, SubmissionChat, SubmissionHistory
import math
from psl_proof.utils.feature_extraction import get_keywords_keybert, get_sentiment_data


def get_total_score(quality, uniqueness) -> float:
    #total_score = quality # Since uniqueness always 1
@@ -87,6 +90,11 @@ def get_uniqueness_score(
return 0.0
return 1.0 # unique

#if time_in_hours <= 24: # within 24 Hours..
# print(f"time_in_hours:{time_in_hours}")
# time_decay = math.log(2) / 12 #half_life: 12hrs, more recent less scores...
# return math.exp(-time_decay * (24 - time_in_hours))

# If no matching source_chat_id is found, return 1
return 1.0
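The commented-out branch above sketches a 12-hour half-life decay: for a resubmission seen t hours after the original (t <= 24), the uniqueness score would be exp(-(ln 2 / 12) * (24 - t)), i.e. 1.0 at 24 h, 0.5 at 12 h, and 0.25 at 0 h. A small check of those values (illustrative only, since this code path is disabled):

import math

# Reproduces the commented-out time-decay formula above.
time_decay = math.log(2) / 12  # 12-hour half-life
for hours in (0, 12, 24):
    score = math.exp(-time_decay * (24 - hours))
    print(hours, round(score, 2))  # 0 -> 0.25, 12 -> 0.5, 24 -> 1.0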

@@ -130,14 +138,22 @@ def validate_data(
        cargo_data.total_uniqueness += uniqueness

        #print(f"source_contents: {source_contents}")
        #RL: No longer generate data for sentiment & keywords
        # Create a ChatData instance and add it to the list
        #chat_data = ChatData(
        #    chat_length=contents_length,
        #    chat_start_on = source_chat.chat_start_on,
        #    chat_ended_on = source_chat.chat_ended_on
        #)

        chat_sentiment = get_sentiment_data(
            source_contents
        )
        chat_keywords = get_keywords_keybert(
            source_contents
        )

        chat_data = ChatData(
            chat_length=contents_length,
            chat_start_on = source_chat.chat_start_on,
            chat_ended_on = source_chat.chat_ended_on,
            sentiment = chat_sentiment,
            keywords = chat_keywords
        )
        #print(f"chat_data: {chat_data}")
        #cargo_data.chat_list.append(
        #    chat_data
        #)
        cargo_data.chat_list.append(
            chat_data
        )
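The new ChatData(...) call passes sentiment and keywords, so the dataclass in cargo_data.py must accept those fields; its definition is not part of this diff. A minimal sketch of the shape the call site implies (an assumption, not the repository's actual class):

from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Tuple

@dataclass
class ChatData:
    # Sketch only: field names mirror the keyword arguments used above.
    chat_length: int
    chat_start_on: Optional[datetime] = None
    chat_ended_on: Optional[datetime] = None
    sentiment: Dict[str, float] = field(default_factory=dict)
    keywords: List[Tuple[str, float]] = field(default_factory=list)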
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,3 +10,5 @@ requests==2.32.3
typing_extensions==4.12.2
urllib3==2.2.3
xxhash==3.5.0
keybert==0.8.5
transformers==4.47.0