generate_descript.py
import pandas as pd
from openai import OpenAI
from time import sleep
from tqdm import tqdm

# Each comment gives: domain - sampling frequency - number of series - total observations.
dataset_list = [
    'london_smart_meters_with_missing',  # Energy - 30T - 713 - 9,543,348
    'bdg-2_bear',                        # Energy - H - 91 - 1,482,312
    'bdg-2_fox',                         # Energy - H - 135 - 2,324,568
    'era5_1989',                         # Climate - H - 8192 - 71,565,312
    'era5_1990',                         # Climate - H - 8192 - 71,565,312
    'era5_1991',                         # Climate - H - 8192 - 71,565,312
    'cmip6_2005',                        # Climate - 6H - 8192 - 59,801,600
    'cmip6_2010',                        # Climate - 6H - 8192 - 59,801,600
    'uber_tlc_daily',                    # Transport - D - 262 - 47,087
    'SZ_TAXI',                           # Transport - 15T - 156 - 464,256
    'PEMS03',                            # Transport - 5T - 358 - 9,382,464
    'weather',                           # Weather - D - 3010 - 12,717,250
    'oikolab_weather',                   # Weather - H - 8 - 800,456
    'hospital',                          # Healthcare - M - 767 - 55,224
    'kaggle_web_traffic_weekly',         # Web - W-SUN - 145,063 - 16,537,182
    'tourism_yearly',                    # Economics - A-DEC - 419 - 14,665
    'tourism_monthly',                   # Economics - M - 366 - 59,658
]
dataset_descriptions = {
    'london_smart_meters_with_missing': 'This dataset contains half-hourly electricity consumption data from smart meters installed in London households, with some missing entries (Sampling rate: 30 minutes).',
    'bdg-2_bear': 'This dataset provides hourly energy consumption data from buildings at the "Bear" site of the Building Data Genome 2 project, for studying energy patterns and efficiency (Sampling rate: Hourly).',
    'bdg-2_fox': 'Similar to the "Bear" dataset, this dataset offers hourly energy consumption data from buildings at the "Fox" site for energy usage analysis (Sampling rate: Hourly).',
    'era5_1989': 'The ERA5 dataset contains hourly global climate reanalysis data for the year 1989, including temperature, wind speed, and precipitation (Sampling rate: Hourly).',
    'era5_1990': 'A continuation of the ERA5 dataset, this collection provides hourly global climate reanalysis data for the year 1990 (Sampling rate: Hourly).',
    'era5_1991': 'Another segment of the ERA5 dataset, offering hourly climate data for the year 1991 for weather and climate studies (Sampling rate: Hourly).',
    'cmip6_2005': 'The CMIP6 dataset provides climate model output data for the year 2005, collected every 6 hours, used for studying climate change projections (Sampling rate: Every 6 hours).',
    'cmip6_2010': 'Similar to the previous entry, this dataset contains climate model data for the year 2010, also sampled every 6 hours (Sampling rate: Every 6 hours).',
    'uber_tlc_daily': 'This dataset comprises daily counts of Uber pickups in New York City, released by the NYC Taxi and Limousine Commission, useful for analyzing transport trends and demand (Sampling rate: Daily).',
    'SZ_TAXI': 'The SZ_TAXI dataset records taxi traffic speeds on 156 road segments in Shenzhen, China, collected every 15 minutes, helpful for traffic analysis and prediction (Sampling rate: Every 15 minutes).',
    'PEMS03': 'This dataset contains traffic flow data collected from highway sensors in California, sampled every 5 minutes, for monitoring and forecasting traffic conditions (Sampling rate: Every 5 minutes).',
    'weather': 'The "weather" dataset offers daily aggregated weather data, including temperature, precipitation, and other meteorological variables (Sampling rate: Daily).',
    'oikolab_weather': 'This dataset provides detailed hourly weather data, useful for applications like agriculture, energy management, and disaster preparedness (Sampling rate: Hourly).',
    'hospital': 'The "hospital" dataset includes monthly healthcare-related time series, such as patient counts, useful for analyzing demand for medical services (Sampling rate: Monthly).',
    'kaggle_web_traffic_weekly': 'This dataset captures weekly web traffic to Wikipedia pages, suitable for analyzing trends in online activity and user behavior (Sampling rate: Weekly).',
    'tourism_yearly': 'The "tourism_yearly" dataset provides annual data on tourism metrics like visitor numbers and revenue, useful for economic and business analysis (Sampling rate: Annual).',
    'tourism_monthly': 'This dataset offers monthly tourism data, focusing on metrics like arrivals and hotel stays, enabling trend analysis in the tourism industry (Sampling rate: Monthly).'
}
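
# Optional sanity check: every dataset in dataset_list should have a matching
# description before any API calls are spent on it.
missing = [d for d in dataset_list if d not in dataset_descriptions]
assert not missing, f"No description for: {missing}"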
def openai_api(messages, model, api_key, temperature=1, top_p=1, stop=None):
    """Stream a chat completion, retrying on transient errors (up to 10 trials)."""
    # Llama/Qwen models are served through Together's OpenAI-compatible
    # endpoint, which needs its own key (read from together_key.txt); every
    # other model goes to OpenAI with the key passed in.
    using_together = 'Llama' in model or 'Qwen' in model
    if using_together:
        client = OpenAI(
            api_key=open("together_key.txt").read().strip(),
            base_url="https://api.together.xyz/v1",
        )
    else:
        client = OpenAI(api_key=api_key)
    message = ""  # initialized up front so a total failure returns "" instead of raising
    got_result = False
    trial = 0
    while not got_result and trial <= 10:
        trial += 1  # count every attempt, so persistent errors cannot loop forever
        try:
            stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=2048,
                temperature=temperature, top_p=top_p, stop=stop)
            message = ""
            for chunk in stream:
                if chunk.choices[0].delta.content is not None:
                    message += chunk.choices[0].delta.content
            got_result = True
        except Exception:
            sleep(3)  # brief back-off before retrying
    return message
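
# Example call (a sketch; assumes key.txt holds a valid OpenAI API key, and
# that a 'Llama'/'Qwen' model name would be routed through Together instead):
#   reply = openai_api(
#       [{"role": "user", "content": "Say hello."}],
#       model='gpt-4o-mini',
#       api_key=open("key.txt").read().strip(),
#   )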
# Generate `variation` rephrased versions of every dataset description.
def generate_rephrased_descriptions(dataset_descriptions, api_key, model="gpt-4o-mini", variation=50):
    rephrased_descriptions = {}
    for dataset, description in dataset_descriptions.items():
        rephrased_versions = []
        print(dataset)
        for i in tqdm(range(variation)):
            messages = [{"role": "user", "content": f"Rephrase the following description: {description}"}]
            response = openai_api(messages, model, api_key)
            print(response.strip())
            rephrased_versions.append(response.strip())  # strip any leading/trailing whitespace
        rephrased_descriptions[dataset] = rephrased_versions
    return rephrased_descriptions
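
# Optional checkpointing variant (a sketch, not used in __main__ below): the
# full run makes len(dataset_descriptions) * variation API calls, so writing
# the CSV after each dataset finishes means a crash mid-run does not lose the
# datasets already completed.
def generate_with_checkpoints(dataset_descriptions, api_key, model="gpt-4o-mini",
                              variation=50, out_path='rephrased_descriptions.csv'):
    rephrased = {}
    for dataset, description in dataset_descriptions.items():
        versions = []
        for _ in tqdm(range(variation), desc=dataset):
            messages = [{"role": "user", "content": f"Rephrase the following description: {description}"}]
            versions.append(openai_api(messages, model, api_key).strip())
        rephrased[dataset] = versions
        # Persist everything generated so far after each dataset completes.
        pd.DataFrame.from_dict(rephrased, orient='index').transpose().to_csv(out_path, index=False)
    return rephrased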
if __name__ == '__main__':
    openai_key = open("key.txt").read().strip()

    # Quick smoke test of the API helper before the full run.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Rephrase the sentence: This dataset contains half-hourly electricity consumption data from smart meters installed in London households, with some missing entries (Sampling rate: 30 minutes)."
                }
            ]
        }
    ]
    m = openai_api(messages, model='gpt-4o-mini', api_key=openai_key)
    print(m)

    # Generate rephrased descriptions
    rephrased_descriptions = generate_rephrased_descriptions(dataset_descriptions, api_key=openai_key)

    # Convert to DataFrame (one column per dataset, one row per variation) and save as CSV
    df_rephrased = pd.DataFrame.from_dict(rephrased_descriptions, orient='index').transpose()
    df_rephrased.to_csv('rephrased_descriptions.csv', index=False)
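
    # Quick verification (a sketch): reload the CSV and confirm its shape,
    # which should be (variation, number of datasets), i.e. (50, 17) here.
    df_check = pd.read_csv('rephrased_descriptions.csv')
    print(df_check.shape)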