Commit 468e0a7: adds whole project with docs
prash5t committed Jun 10, 2022
Showing 199 changed files with 7,670 additions and 0 deletions.
83 changes: 83 additions & 0 deletions README.md
# Sentimento ![Logo](https://play-lh.googleusercontent.com/nAaDRtLZlshur9o3A2XS_K__4I8m_yZ0gvucECrZtGoEGq8NUWE0Zj1vsyjALBui2Q=w35) Mobile Application ![Logo](https://play-lh.googleusercontent.com/1FikpccbOFZsDc5k9x1OQegu8A53tYcY8dkk_neZiCuOcdxWjzUcF3QebE_E9UQNiW4=w40)


**Sentimento**, a social media assisting platform with a sentiment analysis feature, is a
mobile application built with Flutter on the front end, Flask on the back end, and a
Multinomial Naive Bayes classifier for the algorithmic part.

[![Click here to download from Google Play](./report-documentation/doc-assets/googleplay.png)](https://play.google.com/store/apps/details?id=com.awarself.sentimento)


Undergraduate Final Year Project by **Prashant Ghimire** at
[![University Logo](./report-documentation/doc-assets/lmu.png)](https://www.londonmet.ac.uk/)
[![College Logo](./report-documentation/doc-assets/islington.png)](https://islington.edu.np/)

## Read Documentation

- [Final Report](./report-documentation/Final%20Report.pdf)
- [Risk Identification and Assessment Document](./report-documentation/Risk%20Identification%20and%20Assessment%20Document.pdf)
- [Software Requirement Specification](./report-documentation/Software%20Requirement%20Specification.pdf)
- [User Manual](./report-documentation/User%20Manual-%20Sentimento.pdf)
- [Weekly Task Information](./report-documentation/Weekly%20Task%20Information.pdf)


## Tech used

Sentimento uses a number of open source projects to work properly:

- [Flutter] - Cross-platform application development kit
- [Flask] - Micro web framework written in Python
- [MultinomialNB] - Probabilistic classifier for discrete features
- [NumPy] - Support for large, multi-dimensional arrays and matrices
- [Tweepy] - For accessing the Twitter API
- [Google API Core] - For accessing the YouTube API
- [VS Code] - Code editor used for the project
- [GitHub] - Used for version control
- [Postman API Platform] - Used to test the built APIs


## Installation

**Sentimento** can be used either by installing the [application] from Google Play **or** by cloning this [repository].

- To run the frontend application, you are expected to have [Flutter] set up on your system.

- To run the backend application, install the Python dependencies listed in the requirements file:

```sh
cd backend-flask                   # to go inside the backend dir
virtualenv env                     # to create a separate environment to run in
source env/bin/activate            # to start using the environment
pip install -r requirements.txt    # to get the packages (inside the environment, no sudo needed)
flask run --port=80                # to start the backend application
```

Verify the deployment by navigating to your server address in
your preferred browser.

```sh
127.0.0.1:80
```
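
As a quick sanity check, the backend's index route returns a JSON map of the available endpoints, so a single request should confirm the server is up (a minimal sketch, assuming the default local address):

```sh
curl http://127.0.0.1:80/
# expect a JSON object listing routes such as "/register/", "/login/" and "/manual/"
```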

## License

The Sentimento project as published in this [repository] is open source, but the [application] available on Google Play is under a separate license.

### **Thank you**
Please email [email protected] with any queries.

[//]: # (These are reference links used in the body of this note and get stripped out when the markdown processor does its job. There is no need to format nicely because it shouldn't be seen. Thanks SO - http://stackoverflow.com/questions/4823468/store-comments-in-markdown-syntax)

[Flutter]: <https://flutter.dev/>
[Flask]: <https://flask.palletsprojects.com/>
[MultinomialNB]: <https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html>
[NumPy]: <https://pypi.org/project/numpy/>
[Tweepy]: <https://pypi.org/project/tweepy/>
[Google API Core]: <https://pypi.org/project/google-api-core/>
[VS Code]: <https://code.visualstudio.com/>
[GitHub]: <https://github.com/>
[Postman API Platform]: <https://www.postman.com/>
[application]: <https://play.google.com/store/apps/details?id=com.awarself.sentimento>
[repository]: <https://github.com/aprashantz/final-year-project-undergrad>
5 changes: 5 additions & 0 deletions backend-flask/.flaskenv
export FLASK_ENV=development
export FLASK_APP=src
export SQLALCHEMY_DB_URI=sqlite:///sentimento.db

export JWT_SECRET_KEY='assign your secret key here'
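
Since python-dotenv is pinned in requirements.txt, the Flask CLI picks up this .flaskenv automatically; a minimal sketch, run from the backend directory:

```sh
cd backend-flask
flask run --port=80   # FLASK_ENV, FLASK_APP and the variables above are loaded from .flaskenv
```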
62 changes: 62 additions & 0 deletions backend-flask/requirements.txt
astroid==2.4.2
autopep8==1.6.0
cachetools==4.2.4
certifi==2021.10.8
charset-normalizer==2.0.9
click==8.0.3
colorama==0.4.4
distlib==0.3.4
filelock==3.4.2
Flask==2.0.2
Flask-JWT-Extended==4.3.1
Flask-SQLAlchemy==2.5.1
google-api-core==2.3.2
google-api-python-client==2.33.0
google-auth==2.3.3
google-auth-httplib2==0.1.0
googleapis-common-protos==1.54.0
greenlet==1.1.2
gunicorn==20.1.0
httplib2==0.20.2
idna==3.3
isort==4.3.21
itsdangerous==2.0.1
Jinja2==3.0.3
joblib==1.1.0
lazy-object-proxy==1.4.3
MarkupSafe==2.0.1
mccabe==0.6.1
nltk==3.6.5
numpy==1.19.1
oauthlib==3.1.1
pandas==1.3.4
platformdirs==2.4.1
protobuf==3.19.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycodestyle==2.8.0
PyJWT==2.3.0
pylint==2.5.3
pyparsing==3.0.6
python-dateutil==2.8.2
python-dotenv==0.19.2
pytz==2021.3
regex==2021.11.10
requests==2.26.0
requests-oauthlib==1.3.0
rsa==4.8
scikit-learn==1.0.1
scipy==1.7.3
six==1.15.0
sklearn==0.0
SQLAlchemy==1.4.29
textblob==0.17.1
threadpoolctl==3.0.0
toml==0.10.2
tqdm==4.62.3
tweepy==4.4.0
uritemplate==4.1.1
urllib3==1.26.7
virtualenv==20.13.0
Werkzeug==2.0.2
wrapt==1.12.1
59 changes: 59 additions & 0 deletions backend-flask/src/__init__.py
# backend in monolithic architecture
# application factory module

from flask import Flask
import os
from flask_jwt_extended import JWTManager
from src.register import register_blueprint
from src.login import login_blueprint
from src.report import report_blueprint
from src.profile import profile_blueprint
from src.vacancy import vacancy_blueprint
from src.youtube import youtube_blueprint
from src.twitter import twitter_blueprint
from src.manual import manual_blueprint
from src.database import db


def create_app(test_config=None):
    app = Flask(__name__, instance_relative_config=True)

    if test_config is None:
        app.config.from_mapping(
            SECRET_KEY=os.environ.get("SECRET_KEY"),
            SQLALCHEMY_DATABASE_URI=os.environ.get("SQLALCHEMY_DB_URI"),
            SQLALCHEMY_TRACK_MODIFICATIONS=False,
            JWT_SECRET_KEY=os.environ.get('JWT_SECRET_KEY')
        )
    else:
        app.config.from_mapping(test_config)

    app.register_blueprint(register_blueprint)
    app.register_blueprint(login_blueprint)
    app.register_blueprint(report_blueprint)
    app.register_blueprint(profile_blueprint)
    app.register_blueprint(vacancy_blueprint)
    app.register_blueprint(youtube_blueprint)
    app.register_blueprint(twitter_blueprint)
    app.register_blueprint(manual_blueprint)

    # index route listing the backend's endpoints
    @app.route("/")
    def index():
        return {
            "Registration": "/register/ [POST]",
            "Login": "/login/ [POST]",
            "User profile": "/profile/ [GET]",
            "YouTube comments analysis": "/red/ [POST]",
            "Tweets analysis": "/blue/ [POST]",
            "User reports": "/report/ [GET/POST]",
            "Vacancy": "/vacancy/ [GET/POST] & /vacancy/all?filter=freelancer or vacancy [GET]",
            "Sentence/Paragraph polarity": "/manual/ [POST]"}

    db.app = app
    db.init_app(app)

    JWTManager(app)

    return app
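
A hypothetical test-style use of the factory above; the test_config keys mirror the mapping in create_app, and the values here are invented:

```python
from src import create_app

app = create_app({
    "SQLALCHEMY_DATABASE_URI": "sqlite:///:memory:",
    "SQLALCHEMY_TRACK_MODIFICATIONS": False,
    "JWT_SECRET_KEY": "test-key",
})
with app.test_client() as client:
    print(client.get("/").get_json())  # prints the endpoint index defined above
```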
76 changes: 76 additions & 0 deletions backend-flask/src/algo/MultinomialNaiveBayes.py
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from src.algo.data_preprocessor import text_cleaner


# for production, processing time should be as short as possible,
# so we load joblib pickles of the trained model instead of retraining on every request
def production_multinomial(testing_data, layer):
    # deserializing the CountVectorizer's pkl file to an object in the runtime env
    pickled_count_vectorizer = CountVectorizer()
    if (layer == "sarcasm"):
        pickled_count_vectorizer = joblib.load(
            'src/algo/sarcasmpickle_countvectorizer.pkl')
    if (layer == "spam"):
        pickled_count_vectorizer = joblib.load(
            'src/algo/spampickle_countvectorizer.pkl')
    X_test = pickled_count_vectorizer.transform(testing_data)

    # deserializing the MultinomialNB's pkl file to an object in the runtime env
    pickled_multinomial_nv = MultinomialNB()
    if (layer == "sarcasm"):
        pickled_multinomial_nv = joblib.load(
            'src/algo/sarcasmpickle_multinomial.pkl')
    if (layer == "spam"):
        pickled_multinomial_nv = joblib.load(
            'src/algo/spampickle_multinomial.pkl')
    prediction_of_each_data = pickled_multinomial_nv.predict(
        X_test).tolist()  # converted numpy array to list
    # returns a list of 1/0 items: 1 for yes, 0 for no
    return prediction_of_each_data


# this debug function retrains the model when new data are added to Sentimento's training datasets
# it also count-vectorizes the training data, which takes longer than using the pickled model
def debug_multinomial(testing_data, layer):
    training_data = None
    preprocessed_training_data = []
    training_label = []
    if (layer == "spam"):
        training_data = pd.read_csv('src/algo/spam_training.csv').values
        for each in training_data:
            preprocessed_training_data.append(text_cleaner(each[3]))
            training_label.append(each[4])
    if (layer == "sarcasm"):
        training_data = pd.read_csv('src/algo/sarcasm_training.csv').values
        for each in training_data:
            preprocessed_training_data.append(text_cleaner(each[0]))
            training_label.append(each[1])

    # now the count vectorizing part
    cv = CountVectorizer(ngram_range=(1, 2))
    X_train = cv.fit_transform(preprocessed_training_data)

    # serialization: save the fitted vectorizer as a pickle file for production use
    if (layer == "spam"):
        joblib.dump(cv, 'spampickle_countvectorizer.pkl')
    if (layer == "sarcasm"):
        joblib.dump(cv, 'sarcasmpickle_countvectorizer.pkl')

    X_test = cv.transform(testing_data)
    mn = MultinomialNB()
    mn.fit(X_train, training_label)

    # serialization: save the fitted classifier as a pickle file for production use
    if (layer == "spam"):
        joblib.dump(mn, 'spampickle_multinomial.pkl')
    if (layer == "sarcasm"):
        joblib.dump(mn, 'sarcasmpickle_multinomial.pkl')

    prediction_of_each_data = mn.predict(X_test).tolist()
    return prediction_of_each_data
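
A hypothetical call mirroring how the Sentimento class (next file) uses this function; the sample texts are invented, and it assumes the pickled model files exist and the script runs from the backend-flask directory so the relative paths resolve:

```python
from src.algo.data_preprocessor import get_clean_texts
from src.algo.MultinomialNaiveBayes import production_multinomial

raw_comments = ["great video, really helpful", "sure, because that always works out"]
clean_comments = get_clean_texts(raw_comments)
# one 0/1 flag per cleaned text: 1 means the layer fired (here, spam)
spam_flags = production_multinomial(clean_comments, layer="spam")
```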
61 changes: 61 additions & 0 deletions backend-flask/src/algo/Sentimento.py
from src.algo.MultinomialNaiveBayes import production_multinomial
from src.algo.data_preprocessor import get_clean_texts
from nltk.sentiment.vader import SentimentIntensityAnalyzer


class Sentimento:

    # instance attributes, constructor
    def __init__(self, testing_data):
        self.testing_data = testing_data
        self.preprocessed_testing_data = get_clean_texts(testing_data)
        self.spam_detected_list = production_multinomial(
            self.preprocessed_testing_data, layer="spam")
        self.sarcasm_detected_list = production_multinomial(
            self.preprocessed_testing_data, layer="sarcasm")

    # to get the total number of texts taken to analyse
    def data_count(self):
        return len(self.preprocessed_testing_data)

    # to get counts of spam- and sarcasm-detected texts
    def layer_count(self):
        spam_count = 0
        sarcasm_count = 0
        layer_count = []
        for each in self.spam_detected_list:
            if each == 1:
                spam_count += 1
        for each in self.sarcasm_detected_list:
            if each == 1:
                sarcasm_count += 1
        layer_count.append(spam_count)
        layer_count.append(sarcasm_count)
        return layer_count  # index 0 for spam count, index 1 for sarcasm count

    # to get a dict containing the overall polarity and positive/negative/neutral counts
    def overall_polarity(self):
        # considering -1 to -0.2 as negative, -0.2 to 0.2 as neutral and 0.2 to 1 as positive
        positive_count = 0
        negative_count = 0
        neutral_count = 0
        overall_polarity = 0.00000
        sid = SentimentIntensityAnalyzer()
        polarity_result = {}
        compound_polarity = 0
        sum_of_all_polarity = 0
        for each in self.preprocessed_testing_data:
            compound_polarity = sid.polarity_scores(each)["compound"]
            sum_of_all_polarity += compound_polarity
            if (compound_polarity <= -0.2):
                negative_count += 1
            elif (compound_polarity >= 0.2):
                positive_count += 1
            else:
                neutral_count += 1
        overall_polarity = (sum_of_all_polarity / self.data_count())
        polarity_result["positive_count"] = positive_count
        polarity_result["negative_count"] = negative_count
        polarity_result["neutral_count"] = neutral_count
        polarity_result["overall_polarity"] = overall_polarity
        return polarity_result
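
A hypothetical end-to-end use of the class above (sample texts invented; assumes the pickled models and nltk's vader_lexicon are available):

```python
from src.algo.Sentimento import Sentimento

analysis = Sentimento(["I love this product!", "worst purchase ever"])
print(analysis.data_count())        # number of texts that survived cleaning
print(analysis.layer_count())       # [spam_count, sarcasm_count]
print(analysis.overall_polarity())  # counts plus the mean VADER compound score
```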
31 changes: 31 additions & 0 deletions backend-flask/src/algo/data_preprocessor.py
from src.algo.stopwords import customStopWords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer


# function that takes a list of raw texts and returns a list of clean texts
def get_clean_texts(list_of_texts):
    list_of_clean_texts = []
    for each in list_of_texts:
        cleaned_text = text_cleaner(each)
        if (cleaned_text != ''):
            list_of_clean_texts.append(cleaned_text)
    return list_of_clean_texts


# function that takes a raw text and returns a clean text
def text_cleaner(text):
    tokenizer = RegexpTokenizer(r'\w+')
    ps = PorterStemmer()
    tokenized_text = tokenizer.tokenize(text.lower())
    stop_words = customStopWords()  # fetch the stopword list once, not on every token
    clean_tokenized_text = []  # will hold words after filtering stopwords
    for each_token in tokenized_text:
        if each_token not in stop_words:
            # drop stopword tokens
            clean_tokenized_text.append(each_token)
    stemmed_text = []
    for token in clean_tokenized_text:
        # append the stemmed words to the stemmed data
        stemmed_text.append(ps.stem(token))
    clean_data = " ".join(stemmed_text)  # join tokens back into one sentence
    return clean_data
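
For illustration, a hypothetical run of the cleaner; the exact output depends on customStopWords and the Porter stemmer:

```python
from src.algo.data_preprocessor import text_cleaner

print(text_cleaner("The movies were absolutely amazing!"))
# likely something like "movi absolut amaz" after stopword removal and stemming
```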