Skip to content

Commit

Permalink
Merge pull request #19 from SSU-Plector/issue/18-remove-torch
Browse files: browse the repository at this point in the history
♻️ [Refactor] Torch 설치 삭제 후 sentence transformer 모델 도입
  • Loading branch information
chanmin-00 authored Jul 20, 2024
2 parents e1c7be1 + e278c54 commit 2dd34ed
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 29 deletions.
1 change: 0 additions & 1 deletion .ebextensions/01-makerFiles.config
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,7 +15,6 @@ files:

# Install dependencies (if necessary, based on your setup)
pip install -r requirements.txt
pip install torch==2.1.0+cpu torchvision==0.16.0+cpu torchaudio==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

# Run the Flask application using Gunicorn
exec gunicorn -b 0.0.0.0:8000 application:application
Binary file modified requirements.txt
Binary file not shown.
28 changes: 14 additions & 14 deletions src/service/ai.py
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
from flask import abort
from sklearn.feature_extraction.text import TfidfVectorizer

from sentence_transformers import SentenceTransformer, util
from src.enum.part import Part
from src.service.database import developer_part_eq
from src.service.nlp.embedding import get_bert_embeddings
from src.service.nlp.similarity import max_similarity, cal_similarity

# SentenceTransformer 모델 로드
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


# 개발자 매칭 함수
Expand All @@ -23,21 +23,21 @@ def developer_matching(data):
if query is None:
query = ''

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['short_intro'].tolist())
query_tfidf = tfidf_vectorizer.transform([query])
tfidf_similarities = cal_similarity(tfidf_matrix, query_tfidf)
query_embedding = model.encode(query, convert_to_tensor=True) # 쿼리 문장의 임베딩 생성

short_intros = df['short_intro'].tolist()
embeddings = model.encode(short_intros, convert_to_tensor=True) # 데이터프레임의 각 문장에 대한 임베딩 생성

similarities = util.cos_sim(query_embedding, embeddings)[0] # 문장 간의 유사도 계산

developer_embeddings = get_bert_embeddings(df['short_intro'].tolist())
query_embedding = get_bert_embeddings([query])
embedding_similarities = cal_similarity(developer_embeddings, query_embedding)
top_k = min(5, len(similarities))
top_results = similarities.topk(k=top_k, largest=True) # 유사도 높은 상위 5개 선택

hybrid_similarities = 0.5 * tfidf_similarities + 0.5 * embedding_similarities
recommended_indices = max_similarity(hybrid_similarities, 5)
recommended_indices = top_results.indices.cpu().numpy()

result_df = df.iloc[recommended_indices].copy()
result_df['developer_id'] = result_df['developer_id'].astype(int)

return {
'developers': [{'developer_id': dev_id} for dev_id in result_df['developer_id']]
}
}
14 changes: 0 additions & 14 deletions src/service/nlp/embedding.py

This file was deleted.

0 comments on commit 2dd34ed

Please sign in to comment.