Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed Jan 10, 2024
0 parents commit 10dd16b
Show file tree
Hide file tree
Showing 10 changed files with 368 additions and 0 deletions.
42 changes: 42 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
.vscode
*.swp
*.swo
*.sublime-*
*.pyc
jmeter.log
__pycache__
tmp/
package-lock.json
node_modules/
sftp-config.json
.DS_Store
*.iml
*.ipr
*.iws
*.idea
~$*.xls*
~$*.ppt*
~$*.doc*
~*.tmp
nohup.out

CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
build
build-*
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
.env
.fid
_build

dist/
.*~
*~
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Emotional First Aid Raw Dataset
心理咨询问答原始语料库,仅限研究用途。

心理咨询问答原始语料库(以下也称为“本数据集”,“本语料库”)是为应用人工智能技术于心理咨询领域制作的语料。

该语料是[心理咨询问答语料库(Emotional First Aid Dataset,efaqa-corpus-zh)](https://github.com/chatopera/efaqa-corpus-zh)的来源:`心理咨询问答语料库`是在`心理咨询问答原始语料库`的基础上人工标记的结果,仅针对了部分语料完成了标记。所以,`本语料库`是在开放网络下爬取的未标注的语料集。

<!-- 添加数据,语料条数,对话数 -->

Empty file added efaqa_corpus_raw/__init__.py
Empty file.
103 changes: 103 additions & 0 deletions efaqa_corpus_raw/exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#===============================================================================
#
# Copyright (c) 2020 <> All Rights Reserved
#
#
# File: /c/Users/Administrator/chatopera/efaqa-corpus-raw/efaqa_corpus_raw/exporter.py
# Author: Hai Liang Wang
# Date: 2024-01-06:09:28:07
#
#===============================================================================

"""
"""
__copyright__ = "Copyright (c) 2020 . All Rights Reserved"
__author__ = "Hai Liang Wang"
__date__ = "2024-01-06:09:28:07"

import os, sys
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

# Refuse to run on Python 2; on Python 3 `unicode` is simply an alias of `str`.
if sys.version_info < (3,):
    raise RuntimeError("Must be using Python 3")
unicode = str

# Get ENV
ENVIRON = os.environ.copy()
import json
import datetime
from pymongo import MongoClient

# Connection details. The literals below are the defaults; each one can be
# overridden through an environment variable so the exporter can be pointed at
# another MongoDB instance without editing this file.
hostname = os.environ.get("MONGODB_HOST", '192.168.2.219')
# NOTE: 3038 is a site-specific port, not MongoDB's default (27017).
port = int(os.environ.get("MONGODB_PORT", 3038))
# Credentials stay None (no authentication) unless a non-empty value is given.
username = os.environ.get("MONGODB_USERNAME") or None
password = os.environ.get("MONGODB_PASSWORD") or None

# Create a MongoClient instance
mongodb = MongoClient(hostname, port, username=username, password=password)
db = mongodb['geili']
# Access a collection (similar to a table in relational databases)
collection = db['questions']

##########################################################################
# Testcases
##########################################################################
import unittest

# run: python exporter.py   (main() always runs Test.test_export_raw)
class Test(unittest.TestCase):
    '''
    Export job wrapped in a unittest case: dumps the module-level MongoDB
    `collection` to a JSON-lines file.
    '''
    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_export_raw(self):
        '''
        Write every document that has a non-empty "title" and at least one
        entry in "chats" to `tmp/001.export_file.utf8` (located in the parent
        directory of this file), one JSON object per line.

        For each exported document the original "id" is discarded, "_id" is
        stringified and stored as "id", and "crawldate" (when present) is
        rendered as "%Y-%m-%d %H:%M:%S".

        Any error while converting a document is printed together with the
        offending document and re-raised, so nothing is written in that case.
        '''
        print("test_export_raw")
        print("total qa ", collection.count_documents({}))
        output_file = os.path.join(curdir, os.pardir, "tmp", "001.export_file.utf8")
        output_lines = []
        for doc in collection.find({}):
            try:
                if "title" in doc and doc["title"] and "chats" in doc and len(doc["chats"]) > 0:
                    # Replace the stored "id" by the stringified MongoDB "_id";
                    # pop() tolerates documents that never had an "id".
                    doc.pop("id", None)
                    doc["id"] = str(doc["_id"])
                    del doc["_id"]
                    if "crawldate" in doc:
                        # datetime objects are not JSON serializable
                        doc["crawldate"] = datetime.datetime.strftime(doc["crawldate"],'%Y-%m-%d %H:%M:%S')
                    output_lines.append(json.dumps(doc, ensure_ascii=False))
            except Exception as e:
                # Show which document broke the export, then let the runner
                # report the real exception and traceback.
                print(e)
                print(doc)
                raise

        # The tmp directory may not exist when this file is run directly.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as fout:
            for x in output_lines:
                fout.write(x.strip() + "\n")

        print("dumped lines ", len(output_lines))

def test():
    '''
    Run `Test.test_export_raw` with a text test runner.

    :return: the `unittest.TestResult` produced by the run.
    '''
    suite = unittest.TestSuite()
    suite.addTest(Test("test_export_raw"))
    runner = unittest.TextTestRunner()
    return runner.run(suite)

def main():
    '''
    Script entry point: run the export and exit with status 1 if it failed.
    '''
    result = test()
    # unittest records a failing test in the result instead of propagating it,
    # so the failure has to be turned into an exit status explicitly.
    if not result.wasSuccessful():
        sys.exit(1)

if __name__ == '__main__':
    try:
        main()
    finally:
        # Release the connection even when main() exits with an error status.
        mongodb.close()
153 changes: 153 additions & 0 deletions efaqa_corpus_raw/formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#===============================================================================
#
# Copyright (c) 2020 <> All Rights Reserved
#
#
# File: /c/Users/Administrator/chatopera/efaqa-corpus-raw/efaqa_corpus_raw/formatter.py
# Author: Hai Liang Wang
# Date: 2024-01-07:09:05:27
#
#===============================================================================

"""
"""
__copyright__ = "Copyright (c) 2020 . All Rights Reserved"
__author__ = "Hai Liang Wang"
__date__ = "2024-01-07:09:05:27"

import os, sys
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

# Refuse to run on Python 2; on Python 3 `unicode` is simply an alias of `str`.
if sys.version_info < (3,):
    raise RuntimeError("Must be using Python 3")
unicode = str

import json

# Get ENV
ENVIRON = os.environ.copy()


def resolve_all_senders_info(doc):
    '''
    Build anonymous display names for all audience senders of a conversation.

    Each distinct audience sender is identified by the "avatar" of its chat
    entries, or by "userspace" when an entry has no "avatar", and is mapped to
    "Audience1", "Audience2", ... in order of first appearance, so the same
    input always yields the same numbering.

    :param doc: dict with a "chats" list; every entry needs a "sender" key and
        audience entries need "avatar" or "userspace".
    :return: dict mapping sender key -> "Audience<N>".
    :raises KeyError: if an entry lacks "sender", or an audience entry has
        neither "avatar" nor "userspace".
    '''
    senders_info = dict()

    for chat in doc["chats"]:
        if chat["sender"] != "audience":
            continue

        sender_key = chat["avatar"] if "avatar" in chat else chat["userspace"]

        # Number a sender only the first time it is seen.
        if sender_key not in senders_info:
            senders_info[sender_key] = "Audience" + str(len(senders_info) + 1)

    return senders_info


def parse_chats(doc):
    '''
    Clean up the "chats" entries of a document.

    Entries whose "type" is "imageMessage" are dropped. Audience entries get a
    "name" looked up from resolve_all_senders_info(), and the "userspace",
    "avatar" and "type" keys are removed from every kept entry. The entries
    are modified in place; the returned list holds the kept ones in their
    original order.
    '''
    messages = doc["chats"]
    name_by_sender = resolve_all_senders_info(doc)

    kept = []
    for message in messages:
        if message.get("type") == "imageMessage":
            continue

        if message["sender"] == "audience":
            # same key choice as the lookup table: avatar first, else userspace
            if "avatar" in message:
                sender_key = message["avatar"]
            else:
                sender_key = message["userspace"]
            message["name"] = name_by_sender[sender_key]

        for field in ("userspace", "avatar", "type"):
            message.pop(field, None)

        kept.append(message)

    return kept


##########################################################################
# Testcases
##########################################################################
import unittest

# run: python formatter.py   (main() always runs Test.test_format)
class Test(unittest.TestCase):
    '''
    Formatting job wrapped in a unittest case: turns the exported JSON-lines
    file into a de-identified one.
    '''
    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_format(self):
        '''
        Read `tmp/001.export_file.utf8` (in the parent directory of this
        file), remove the "avatar", "url", "project", "spider", "server" and
        "crawldate" fields from every document, rewrite its "chats" with
        parse_chats(), drop documents left without chats, and write the rest
        to `tmp/002.format_file.utf8`, one JSON object per line.
        '''
        print("test_format")
        input_file = os.path.join(curdir, os.pardir, "tmp", "001.export_file.utf8")

        with open(input_file, "r", encoding="utf-8") as fin:
            input_lines = [json.loads(line) for line in fin]

        print("loads docs", len(input_lines))

        output_lines = []
        output_file = os.path.join(curdir, os.pardir, "tmp", "002.format_file.utf8")
        # De-identify the documents
        for x in input_lines:
            # pop() with a default: a document lacking one of these fields
            # must not abort the whole run.
            for field in ("avatar", "url", "project", "spider", "server", "crawldate"):
                x.pop(field, None)

            x["chats"] = parse_chats(x)

            if len(x["chats"]) == 0: continue

            output_lines.append(x)

        print("output_lines ", len(output_lines))
        with open(output_file, "w", encoding="utf-8") as fout:
            for x in output_lines:
                fout.write(json.dumps(x, ensure_ascii=False) + "\n")


def test():
    '''
    Run `Test.test_format` with a text test runner.

    :return: the `unittest.TestResult` produced by the run.
    '''
    suite = unittest.TestSuite()
    suite.addTest(Test("test_format"))
    runner = unittest.TextTestRunner()
    return runner.run(suite)

def main():
    '''
    Script entry point: run the formatting job and exit with status 1 if it
    failed.
    '''
    result = test()
    # unittest records a failing test in the result instead of propagating it,
    # so the failure has to be turned into an exit status explicitly.
    if not result.wasSuccessful():
        sys.exit(1)

if __name__ == '__main__':
    main()
Empty file added efaqa_corpus_raw/stats.py
Empty file.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pymongo==3.10.1
30 changes: 30 additions & 0 deletions scripts/001.export_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#! /bin/bash
###########################################
# Run efaqa_corpus_raw/exporter.py from the repository root's tmp-aware layout.
###########################################

# constants
baseDir=$(cd "$(dirname "$0")" && pwd)
cwdDir=$PWD
export PYTHONUNBUFFERED=1
export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH
export TS=$(date +%Y%m%d%H%M%S)
export DATE=$(date "+%Y%m%d")
export DATE_WITH_TIME=$(date "+%Y%m%d-%H%M%S") # add %3N to the format if milliseconds are wanted too

# functions

# main
[ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return
# Quote the paths (they may contain spaces) and stop if the directory is missing.
cd "$baseDir/.." || exit 1

if [ -f .env ]; then
    source .env
fi

# exporter.py writes its output into tmp/, so make sure it exists.
mkdir -p tmp

cd "$baseDir/../efaqa_corpus_raw" || exit 1
python exporter.py
30 changes: 30 additions & 0 deletions scripts/002.format_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#! /bin/bash
###########################################
# Run efaqa_corpus_raw/formatter.py from the repository root's tmp-aware layout.
###########################################

# constants
baseDir=$(cd "$(dirname "$0")" && pwd)
cwdDir=$PWD
export PYTHONUNBUFFERED=1
export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH
export TS=$(date +%Y%m%d%H%M%S)
export DATE=$(date "+%Y%m%d")
export DATE_WITH_TIME=$(date "+%Y%m%d-%H%M%S") # add %3N to the format if milliseconds are wanted too

# functions

# main
[ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return
# Quote the paths (they may contain spaces) and stop if the directory is missing.
cd "$baseDir/.." || exit 1

if [ -f .env ]; then
    source .env
fi

# formatter.py reads from and writes into tmp/, so make sure it exists.
mkdir -p tmp

cd "$baseDir/../efaqa_corpus_raw" || exit 1
python formatter.py
Empty file added setup.py
Empty file.

0 comments on commit 10dd16b

Please sign in to comment.