-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 10dd16b
Showing
10 changed files
with
368 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
.vscode | ||
*.swp | ||
*.swo | ||
*.sublime-* | ||
*.pyc | ||
jmeter.log | ||
__pycache__ | ||
tmp/ | ||
package-lock.json | ||
node_modules/ | ||
sftp-config.json | ||
.DS_Store | ||
*.iml | ||
*.ipr | ||
*.iws | ||
*.idea | ||
~$*.xls* | ||
~$*.ppt* | ||
~$*.doc* | ||
~*.tmp | ||
nohup.out | ||
|
||
CMakeLists.txt.user | ||
CMakeCache.txt | ||
CMakeFiles | ||
CMakeScripts | ||
Testing | ||
Makefile | ||
build | ||
build-* | ||
cmake_install.cmake | ||
install_manifest.txt | ||
compile_commands.json | ||
CTestTestfile.cmake | ||
_deps | ||
.env | ||
.fid | ||
_build | ||
|
||
dist/ | ||
.*~ | ||
*~ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Emotional First Aid Raw Dataset | ||
心理咨询问答原始语料库,仅限研究用途。 | ||
|
||
心理咨询问答原始语料库(以下也称为“本数据集”,“本语料库”)是为应用人工智能技术于心理咨询领域制作的语料。 | ||
|
||
该语料是[心理咨询问答语料库(Emotional First Aid Dataset,efaqa-corpus-zh)](https://github.com/chatopera/efaqa-corpus-zh)的来源:`心理咨询问答语料库`是在`心理咨询问答原始语料库`的基础上人工标记的结果,仅针对部分语料完成了标记。所以,`本语料库`是在开放网络下爬取的未标注的语料集。 | ||
|
||
<!-- 添加数据,语料条数,对话数 --> | ||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
#=============================================================================== | ||
# | ||
# Copyright (c) 2020 <> All Rights Reserved | ||
# | ||
# | ||
# File: /c/Users/Administrator/chatopera/efaqa-corpus-raw/efaqa_corpus_raw/exporter.py | ||
# Author: Hai Liang Wang | ||
# Date: 2024-01-06:09:28:07 | ||
# | ||
#=============================================================================== | ||
|
||
""" | ||
""" | ||
__copyright__ = "Copyright (c) 2020 . All Rights Reserved" | ||
__author__ = "Hai Liang Wang" | ||
__date__ = "2024-01-06:09:28:07" | ||
|
||
import os, sys | ||
curdir = os.path.dirname(os.path.abspath(__file__)) | ||
sys.path.append(curdir) | ||
|
||
if sys.version_info[0] < 3: | ||
raise RuntimeError("Must be using Python 3") | ||
else: | ||
unicode = str | ||
|
||
# Get ENV | ||
ENVIRON = os.environ.copy() | ||
import json | ||
import datetime | ||
from pymongo import MongoClient | ||
|
||
# Connection details. Each value can be overridden through the process
# environment (captured in ENVIRON above); the fallbacks are the values that
# used to be hard-coded here, so running without any override behaves as before.
hostname = ENVIRON.get("MONGODB_HOST", '192.168.2.219')
# NOTE: 3038 is a deployment-specific port, not MongoDB's stock port (27017).
port = int(ENVIRON.get("MONGODB_PORT", 3038))
# Empty or missing credentials mean "connect without authentication".
username = ENVIRON.get("MONGODB_USERNAME") or None
password = ENVIRON.get("MONGODB_PASSWORD") or None

# Create a MongoClient instance
mongodb = MongoClient(hostname, port, username=username, password=password)
db = mongodb['geili']
# Access a collection (similar to a table in relational databases)
collection = db['questions']
|
||
########################################################################## | ||
# Testcases | ||
########################################################################## | ||
import unittest | ||
|
||
# run testcase: python exporter.py   (main() always runs Test.test_export_raw)
class Test(unittest.TestCase):
    '''
    Export job wrapped in a unittest case: dumps the MongoDB collection to a
    JSON-lines file.
    '''
    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_export_raw(self):
        '''
        Write every document that has a non-empty title and at least one chat
        to tmp/001.export_file.utf8, one JSON object per line.

        The Mongo "_id" replaces any existing "id" field (as a string) and
        "crawldate" is rendered as "YYYY-MM-DD HH:MM:SS" so the document can
        be serialised by json.
        '''
        print("test_export_raw")
        print("total qa ", collection.count_documents({}))
        output_file = os.path.join(curdir, os.pardir, "tmp", "001.export_file.utf8")
        output_lines = []

        for doc in collection.find({}):
            try:
                if doc.get("title") and doc.get("chats"):
                    # Drop a pre-existing "id" if there is one; a document
                    # without it must not abort the whole export.
                    doc.pop("id", None)
                    doc["id"] = str(doc.pop("_id"))
                    if "crawldate" in doc:
                        doc["crawldate"] = datetime.datetime.strftime(
                            doc["crawldate"], '%Y-%m-%d %H:%M:%S')
                    output_lines.append(json.dumps(doc, ensure_ascii=False))
            except Exception as e:
                # Show the offending document, then stop: nothing is written
                # when any document fails to convert.
                print(e)
                print(doc)
                sys.exit(1)

        # The shell wrapper creates tmp/, but the module may be run directly.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as fout:
            for line in output_lines:
                fout.write(line.strip() + "\n")

        print("dumped lines ", len(output_lines))
|
||
def test():
    '''
    Run Test.test_export_raw with a text runner and return the TestResult.
    '''
    suite = unittest.TestSuite()
    suite.addTest(Test("test_export_raw"))
    runner = unittest.TextTestRunner()
    return runner.run(suite)


def main():
    '''
    Script entry point: run the export and exit with status 1 if it failed,
    so that calling shell scripts can detect the failure.
    '''
    result = test()
    if not result.wasSuccessful():
        sys.exit(1)


if __name__ == '__main__':
    try:
        main()
    finally:
        # Close the client even when main() exits with a failure status.
        mongodb.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
#=============================================================================== | ||
# | ||
# Copyright (c) 2020 <> All Rights Reserved | ||
# | ||
# | ||
# File: /c/Users/Administrator/chatopera/efaqa-corpus-raw/efaqa_corpus_raw/formatter.py | ||
# Author: Hai Liang Wang | ||
# Date: 2024-01-07:09:05:27 | ||
# | ||
#=============================================================================== | ||
|
||
""" | ||
""" | ||
__copyright__ = "Copyright (c) 2020 . All Rights Reserved" | ||
__author__ = "Hai Liang Wang" | ||
__date__ = "2024-01-07:09:05:27" | ||
|
||
import os, sys | ||
curdir = os.path.dirname(os.path.abspath(__file__)) | ||
sys.path.append(curdir) | ||
|
||
if sys.version_info[0] < 3: | ||
raise RuntimeError("Must be using Python 3") | ||
else: | ||
unicode = str | ||
|
||
import json | ||
|
||
# Get ENV | ||
ENVIRON = os.environ.copy() | ||
|
||
|
||
def resolve_all_senders_info(doc):
    '''
    Map every audience sender in doc["chats"] to an anonymous label.

    A sender is identified by the message's "avatar" value, or by its
    "userspace" value when the message has no "avatar" key. Labels are
    "Audience1", "Audience2", ... numbered in order of first appearance, so
    the same input always produces the same mapping.

    Returns a dict {identity: label}. Messages whose "sender" is not
    "audience" are ignored. Raises KeyError when a message has no "sender",
    or when an audience message has neither "avatar" nor "userspace".
    '''
    senders_info = dict()

    for chat in doc["chats"]:
        if chat["sender"] != "audience":
            continue

        identity = chat["avatar"] if "avatar" in chat else chat["userspace"]
        if identity not in senders_info:
            # Number by first appearance rather than by set iteration order,
            # which is not stable between runs for strings.
            senders_info[identity] = "Audience" + str(len(senders_info) + 1)

    return senders_info
|
||
|
||
def parse_chats(doc):
    '''
    Build the sanitised chat list for one document.

    Messages whose "type" is "imageMessage" are dropped. Every remaining
    message is copied without its "userspace", "avatar" and "type" keys, and
    audience messages get a "name" taken from resolve_all_senders_info().

    The dicts in doc["chats"] are left untouched; a new list of new dicts is
    returned. Raises KeyError when a message has no "sender".
    '''
    senders_info = resolve_all_senders_info(doc)
    removed_keys = ("userspace", "avatar", "type")

    chats = []
    for original in doc["chats"]:
        if original.get("type") == "imageMessage":
            continue

        # Copy instead of deleting keys in place, so the caller's data is not
        # modified as a side effect.
        chat = {key: value for key, value in original.items()
                if key not in removed_keys}

        if original["sender"] == "audience":
            identity = original["avatar"] if "avatar" in original else original["userspace"]
            chat["name"] = senders_info[identity]

        chats.append(chat)

    return chats
|
||
|
||
########################################################################## | ||
# Testcases | ||
########################################################################## | ||
import unittest | ||
|
||
# run testcase: python formatter.py   (main() always runs Test.test_format)
class Test(unittest.TestCase):
    '''
    Formatting job wrapped in a unittest case: sanitises the exported
    JSON-lines file.
    '''
    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_format(self):
        '''
        Read tmp/001.export_file.utf8, strip identifying fields from every
        document, rewrite its chats with parse_chats(), and write the
        documents that still have chats to tmp/002.format_file.utf8.
        '''
        print("test_format")
        input_file = os.path.join(curdir, os.pardir, "tmp", "001.export_file.utf8")
        input_lines = []

        with open(input_file, "r", encoding="utf-8") as fin:
            for line in fin:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                if line.strip():
                    input_lines.append(json.loads(line))

        print("loads docs", len(input_lines))

        output_lines = []
        output_file = os.path.join(curdir, os.pardir, "tmp", "002.format_file.utf8")
        # Data desensitisation: remove each field when present, so a document
        # lacking one of them does not raise KeyError.
        sensitive_fields = ("avatar", "url", "project", "spider", "server", "crawldate")
        for x in input_lines:
            for field in sensitive_fields:
                x.pop(field, None)

            x["chats"] = parse_chats(x)

            if len(x["chats"]) == 0:
                continue

            output_lines.append(x)

        print("output_lines ", len(output_lines))
        with open(output_file, "w", encoding="utf-8") as fout:
            for x in output_lines:
                fout.write(json.dumps(x, ensure_ascii=False) + "\n")
|
||
|
||
def test():
    '''
    Run Test.test_format with a text runner and return the TestResult.
    '''
    suite = unittest.TestSuite()
    suite.addTest(Test("test_format"))
    runner = unittest.TextTestRunner()
    return runner.run(suite)


def main():
    '''
    Script entry point: run the formatting job and exit with status 1 if it
    failed, so that calling shell scripts can detect the failure.
    '''
    result = test()
    if not result.wasSuccessful():
        sys.exit(1)


if __name__ == '__main__':
    main()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pymongo==3.10.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#! /bin/bash
###########################################
# Run efaqa_corpus_raw/exporter.py from the project root,
# after loading .env (if present) and making sure tmp/ exists.
###########################################

# constants
baseDir=$(cd "$(dirname "$0")" && pwd)
cwdDir=$PWD
export PYTHONUNBUFFERED=1
export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH
export TS=$(date +%Y%m%d%H%M%S)
export DATE=$(date "+%Y%m%d")
# Second resolution; %3N could be appended for milliseconds (GNU date only).
export DATE_WITH_TIME=$(date "+%Y%m%d-%H%M%S")

# functions

# main
# Stop here when the file is sourced rather than executed.
[ -z "${BASH_SOURCE[0]}" ] || [ "${BASH_SOURCE[0]}" = "$0" ] || return
cd "$baseDir/.." || exit 1

if [ -f .env ]; then
    source .env
fi

mkdir -p tmp

cd "$baseDir/../efaqa_corpus_raw" || exit 1
python exporter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#! /bin/bash
###########################################
# Run efaqa_corpus_raw/formatter.py from the project root,
# after loading .env (if present) and making sure tmp/ exists.
###########################################

# constants
baseDir=$(cd "$(dirname "$0")" && pwd)
cwdDir=$PWD
export PYTHONUNBUFFERED=1
export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH
export TS=$(date +%Y%m%d%H%M%S)
export DATE=$(date "+%Y%m%d")
# Second resolution; %3N could be appended for milliseconds (GNU date only).
export DATE_WITH_TIME=$(date "+%Y%m%d-%H%M%S")

# functions

# main
# Stop here when the file is sourced rather than executed.
[ -z "${BASH_SOURCE[0]}" ] || [ "${BASH_SOURCE[0]}" = "$0" ] || return
cd "$baseDir/.." || exit 1

if [ -f .env ]; then
    source .env
fi

mkdir -p tmp

cd "$baseDir/../efaqa_corpus_raw" || exit 1
python formatter.py
Empty file.