generate_descript.py
import pandas as pd
from openai import OpenAI
from time import sleep
from tqdm import tqdm

# Each comment gives: domain - sampling frequency - number of series - total observations.
dataset_list = [
    'london_smart_meters_with_missing',  # Energy - 30T - 713 - 9,543,348
    'bdg-2_bear',                        # Energy - H - 91 - 1,482,312
    'bdg-2_fox',                         # Energy - H - 135 - 2,324,568
    'era5_1989',                         # Climate - H - 8192 - 71,565,312
    'era5_1990',                         # Climate - H - 8192 - 71,565,312
    'era5_1991',                         # Climate - H - 8192 - 71,565,312
    'cmip6_2005',                        # Climate - 6H - 8192 - 59,801,600
    'cmip6_2010',                        # Climate - 6H - 8192 - 59,801,600
    'uber_tlc_daily',                    # Transport - D - 262 - 47,087
    'SZ_TAXI',                           # Transport - 15T - 156 - 464,256
    'PEMS03',                            # Transport - 5T - 358 - 9,382,464
    'weather',                           # Weather - D - 3010 - 12,717,250
    'oikolab_weather',                   # Weather - H - 8 - 800,456
    'hospital',                          # Healthcare - M - 767 - 55,224
    'kaggle_web_traffic_weekly',         # Web - W-SUN - 145,063 - 16,537,182
    'tourism_yearly',                    # Economics - A-DEC - 419 - 14,665
    'tourism_monthly',                   # Economics - M - 366 - 59,658
]
dataset_descriptions = {
    'london_smart_meters_with_missing': 'This dataset contains half-hourly electricity consumption data from smart meters installed in London households, with some missing entries (Sampling rate: 30 minutes).',
    'bdg-2_bear': 'This dataset provides hourly energy consumption data from buildings at the "Bear" site of the Building Data Genome 2 project, for studying energy patterns and efficiency (Sampling rate: Hourly).',
    'bdg-2_fox': 'Similar to the "Bear" dataset, this dataset offers hourly energy consumption data from buildings at the "Fox" site for energy usage analysis (Sampling rate: Hourly).',
    'era5_1989': 'The ERA5 dataset contains hourly global climate reanalysis data for the year 1989, including temperature, wind speed, and precipitation (Sampling rate: Hourly).',
    'era5_1990': 'A continuation of the ERA5 dataset, this collection provides hourly global climate reanalysis data for the year 1990 (Sampling rate: Hourly).',
    'era5_1991': 'Another segment of the ERA5 dataset, offering hourly climate data for the year 1991 for weather and climate studies (Sampling rate: Hourly).',
    'cmip6_2005': 'The CMIP6 dataset provides climate model output data for the year 2005, collected every 6 hours, used for studying climate change projections (Sampling rate: Every 6 hours).',
    'cmip6_2010': 'Similar to the previous entry, this dataset contains climate model data for the year 2010, also sampled every 6 hours (Sampling rate: Every 6 hours).',
    'uber_tlc_daily': 'This dataset comprises daily counts of Uber pickups in New York City, released by the NYC Taxi and Limousine Commission, useful for analyzing transport trends and demand (Sampling rate: Daily).',
    'SZ_TAXI': 'The SZ_TAXI dataset records taxi traffic speeds on 156 road segments in Shenzhen, China, collected every 15 minutes, helpful for traffic analysis and prediction (Sampling rate: Every 15 minutes).',
    'PEMS03': 'This dataset contains traffic flow data collected from highway sensors in California, sampled every 5 minutes, for monitoring and forecasting traffic conditions (Sampling rate: Every 5 minutes).',
    'weather': 'The "weather" dataset offers daily aggregated weather data, including temperature, precipitation, and other meteorological variables (Sampling rate: Daily).',
    'oikolab_weather': 'This dataset provides detailed hourly weather data, useful for applications like agriculture, energy management, and disaster preparedness (Sampling rate: Hourly).',
    'hospital': 'The "hospital" dataset includes monthly healthcare-related time series, such as patient counts, useful for analyzing demand for medical services (Sampling rate: Monthly).',
    'kaggle_web_traffic_weekly': 'This dataset captures weekly web traffic to Wikipedia pages, suitable for analyzing trends in online activity and user behavior (Sampling rate: Weekly).',
    'tourism_yearly': 'The "tourism_yearly" dataset provides annual data on tourism metrics like visitor numbers and revenue, useful for economic and business analysis (Sampling rate: Annual).',
    'tourism_monthly': 'This dataset offers monthly tourism data, focusing on metrics like arrivals and hotel stays, enabling trend analysis in the tourism industry (Sampling rate: Monthly).'
}
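
# Optional sanity check: every dataset in dataset_list should have a matching
# description before any API calls are spent on it.
missing = [d for d in dataset_list if d not in dataset_descriptions]
assert not missing, f"No description for: {missing}"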
def openai_api(messages, model, api_key, temperature=1, top_p=1, stop=None):
    """Stream a chat completion, retrying on transient errors (up to 10 trials)."""
    # Llama/Qwen models are served through Together's OpenAI-compatible
    # endpoint, which needs its own key (read from together_key.txt); every
    # other model goes to OpenAI with the key passed in.
    using_together = 'Llama' in model or 'Qwen' in model
    if using_together:
        client = OpenAI(
            api_key=open("together_key.txt").read().strip(),
            base_url="https://api.together.xyz/v1",
        )
    else:
        client = OpenAI(api_key=api_key)
    message = ""  # initialized up front so a total failure returns "" instead of raising
    got_result = False
    trial = 0
    while not got_result and trial <= 10:
        trial += 1  # count every attempt, so persistent errors cannot loop forever
        try:
            stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=2048,
                temperature=temperature, top_p=top_p, stop=stop)
            message = ""
            for chunk in stream:
                if chunk.choices[0].delta.content is not None:
                    message += chunk.choices[0].delta.content
            got_result = True
        except Exception:
            sleep(3)  # brief back-off before retrying
    return message
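
# Example call (a sketch; assumes key.txt holds a valid OpenAI API key, and
# that a 'Llama'/'Qwen' model name would be routed through Together instead):
#   reply = openai_api(
#       [{"role": "user", "content": "Say hello."}],
#       model='gpt-4o-mini',
#       api_key=open("key.txt").read().strip(),
#   )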
# Generate `variation` rephrased versions of every dataset description.
def generate_rephrased_descriptions(dataset_descriptions, api_key, model="gpt-4o-mini", variation=50):
    rephrased_descriptions = {}
    for dataset, description in dataset_descriptions.items():
        rephrased_versions = []
        print(dataset)
        for i in tqdm(range(variation)):
            messages = [{"role": "user", "content": f"Rephrase the following description: {description}"}]
            response = openai_api(messages, model, api_key)
            print(response.strip())
            rephrased_versions.append(response.strip())  # strip any leading/trailing whitespace
        rephrased_descriptions[dataset] = rephrased_versions
    return rephrased_descriptions
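
# Optional checkpointing variant (a sketch, not used in __main__ below): the
# full run makes len(dataset_descriptions) * variation API calls, so writing
# the CSV after each dataset finishes means a crash mid-run does not lose the
# datasets already completed.
def generate_with_checkpoints(dataset_descriptions, api_key, model="gpt-4o-mini",
                              variation=50, out_path='rephrased_descriptions.csv'):
    rephrased = {}
    for dataset, description in dataset_descriptions.items():
        versions = []
        for _ in tqdm(range(variation), desc=dataset):
            messages = [{"role": "user", "content": f"Rephrase the following description: {description}"}]
            versions.append(openai_api(messages, model, api_key).strip())
        rephrased[dataset] = versions
        # Persist everything generated so far after each dataset completes.
        pd.DataFrame.from_dict(rephrased, orient='index').transpose().to_csv(out_path, index=False)
    return rephrased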
if __name__ == '__main__':
    openai_key = open("key.txt").read().strip()

    # Quick smoke test of the API helper before the full run.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Rephrase the sentence: This dataset contains half-hourly electricity consumption data from smart meters installed in London households, with some missing entries (Sampling rate: 30 minutes)."
                }
            ]
        }
    ]
    m = openai_api(messages, model='gpt-4o-mini', api_key=openai_key)
    print(m)

    # Generate rephrased descriptions
    rephrased_descriptions = generate_rephrased_descriptions(dataset_descriptions, api_key=openai_key)

    # Convert to DataFrame (one column per dataset, one row per variation) and save as CSV
    df_rephrased = pd.DataFrame.from_dict(rephrased_descriptions, orient='index').transpose()
    df_rephrased.to_csv('rephrased_descriptions.csv', index=False)
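
    # Quick verification (a sketch): reload the CSV and confirm its shape,
    # which should be (variation, number of datasets), i.e. (50, 17) here.
    df_check = pd.read_csv('rephrased_descriptions.csv')
    print(df_check.shape)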