pr ready

boostcampaitech7 · Nov 10, 2024 · f980701 · f980701
1 parent 224e4bc
commit f980701
Show file tree

Hide file tree

Showing 10 changed files with 540 additions and 0 deletions.
diff --git a/src/pipeline1/augmentation/aug_replace_special_space.py b/src/pipeline1/augmentation/aug_replace_special_space.py
@@ -0,0 +1,31 @@
+import pandas as pd
+import re
+
+def replace_special_chars(text):
+    """Replace every non-word, non-whitespace character in ``text`` with a space.
+
+    Args:
+        text: Value to clean. Anything that is not a ``str`` is passed through.
+
+    Returns:
+        The cleaned string, or ``text`` itself when it is not a string.
+    """
+    if isinstance(text, str):
+        # [^\w\s] = anything that is neither a word character nor whitespace.
+        return re.sub(r"[^\w\s]", " ", text)
+    return text
+
+def add_cleaned_text_as_new_rows(input_path, output_path, text_column="text"):
+    """Append a cleaned copy of every row to a CSV and write the result.
+
+    Args:
+        input_path: CSV file to read.
+        output_path: CSV file to write.
+        text_column: Name of the column passed through replace_special_chars.
+
+    Returns:
+        None. The combined rows are written to ``output_path``.
+    """
+    original = pd.read_csv(input_path)
+
+    # Same rows as the input, with the text column cleaned.
+    cleaned = original.copy()
+    cleaned[text_column] = original[text_column].apply(replace_special_chars)
+
+    # Original rows first, cleaned copies after, renumbered from 0.
+    combined = pd.concat([original, cleaned], ignore_index=True)
+
+    combined.to_csv(output_path, index=False)
+    print(f"CSV with cleaned text added as new rows saved to {output_path}")
+
+# Example run
+if __name__ == "__main__":
+    add_cleaned_text_as_new_rows(
+        "./data/preprocessed/preprocessed_train_v3_2.csv",  # example input path
+        "./data/augmented/preprocessed_train_v4_2.csv",
+    )
diff --git a/src/pipeline1/augmentation/back_translation.py b/src/pipeline1/augmentation/back_translation.py
@@ -0,0 +1,50 @@
+import pandas as pd
+import deepl
+from tqdm import tqdm
+
+# DeepL API credentials.
+# The key is taken from the DEEPL_AUTH_KEY environment variable so a real key
+# never has to be written into this file; the original placeholder "key" is
+# kept as the fallback when the variable is not set.
+import os
+
+auth_key = os.environ.get("DEEPL_AUTH_KEY", "key")
+translator_deepl = deepl.Translator(auth_key)
+
+# Back-translation with DeepL
+def back_translate_deepl(text):
+    """Translate ``text`` to English (EN-US) and back to Korean (KO) with DeepL.
+
+    Args:
+        text: Source string to back-translate.
+
+    Returns:
+        The back-translated string, or None when ``text`` is not a non-empty
+        string, when the intermediate English result is empty, or when either
+        DeepL call raises.
+    """
+    # Missing CSV cells arrive as NaN (a float), and an empty string is
+    # presumably rejected by the client as well; skip both up front so no
+    # request is spent on a call that can only end in the except branch.
+    if not isinstance(text, str) or not text:
+        return None
+
+    try:
+        # Step 1: source text -> English
+        en_result = translator_deepl.translate_text(text, target_lang="EN-US")
+        if not en_result.text:
+            return None  # nothing usable came back
+
+        # Step 2: English -> Korean
+        ko_result = translator_deepl.translate_text(en_result.text, target_lang="KO")
+        return ko_result.text
+
+    except Exception as e:
+        # Best effort: a failing row is reported and skipped so that one bad
+        # request does not abort the whole augmentation run.
+        print(f"Error in back-translation for text '{text}': {e}")
+        return None
+
+# Augment a CSV with back-translated copies of its rows
+def augment_csv_with_back_translation(input_csv_path, output_csv_path):
+    """Append a back-translated copy of each row to the CSV and save it.
+
+    Reads ``input_csv_path``, back-translates the ``text`` column with
+    back_translate_deepl, and writes the original rows followed by the rows
+    whose back-translation is not null to ``output_csv_path``.
+
+    Args:
+        input_csv_path: CSV file to read; must contain a ``text`` column.
+        output_csv_path: CSV file to write.
+
+    Returns:
+        None.
+    """
+    frame = pd.read_csv(input_csv_path)
+
+    # Register progress_apply on pandas objects so a progress bar is shown.
+    tqdm.pandas()
+    frame['augmented_text'] = frame['text'].progress_apply(back_translate_deepl)
+
+    # Original rows, without the helper column.
+    originals = frame.drop(columns=['augmented_text'])
+
+    # Rows with a usable back-translation: move it into 'text' and drop the
+    # helper column so both parts share the same columns.
+    succeeded = frame[frame['augmented_text'].notna()]
+    augmented = succeeded.drop(columns=['augmented_text']).assign(
+        text=succeeded['augmented_text']
+    )
+
+    final_df = pd.concat([originals, augmented], ignore_index=True)
+
+    final_df.to_csv(output_csv_path, index=False)
+    print(f"Augmented data saved to {output_csv_path}")
+
+# Example file paths
+input_csv_path = "./data/preprocessed/unique_ids_engWord_output.csv"       # input CSV
+output_csv_path = "./data/preprocessed/unique_ids_engWord_output_backtransAug.csv"  # output CSV
+
+# Run the augmentation only when this file is executed as a script; without
+# this call the paths above were defined but never used.
+if __name__ == "__main__":
+    augment_csv_with_back_translation(input_csv_path, output_csv_path)
diff --git a/src/pipeline1/preprocessing/combine_row.py b/src/pipeline1/preprocessing/combine_row.py
@@ -0,0 +1,19 @@
+# Row-wise concatenation of two CSV files
+
+import pandas as pd
+
+# Input CSV paths (example paths)
+csv1_path = "./data/preprocessed/pipeline1_step4.csv"
+csv2_path = "./data/preprocessed/pipeline1_unique_ids_output.csv"
+
+# Read both files
+df1, df2 = pd.read_csv(csv1_path), pd.read_csv(csv2_path)
+
+# Stack the rows of the second file under the first, renumbering from 0
+combined_df = pd.concat([df1, df2], ignore_index=True)
+
+# Write the combined rows to a new CSV file
+combined_df.to_csv('./data/final/pipeline1_final.csv', index=False)
+
+print("두 CSV 파일이 행으로 합쳐진 CSV 파일이 생성되었습니다.")
diff --git a/src/pipeline1/preprocessing/extract_unique_ids.py b/src/pipeline1/preprocessing/extract_unique_ids.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+train_path = "./data/raw/train.csv"  # example path
+csv_path = "./data/preprocessed/pipeline1_step2.csv"
+
+# Read both CSV files
+df1 = pd.read_csv(train_path)
+df2 = pd.read_csv(csv_path)
+
+# Keep the rows whose ID appears in only one of the two files
+in_both_1 = df1['ID'].isin(df2['ID'])
+in_both_2 = df2['ID'].isin(df1['ID'])
+unique_df1 = df1.loc[~in_both_1]
+unique_df2 = df2.loc[~in_both_2]
+
+# Collect the non-overlapping rows from both sides in one frame
+unique_rows = pd.concat([unique_df1, unique_df2])
+
+# Write the result to a new CSV file
+unique_rows.to_csv('./data/preprocessed/unique_ids.csv', index=False)
+
+print("겹치지 않는 ID만 저장된 CSV 파일이 생성되었습니다.")
diff --git a/src/pipeline1/preprocessing/filter_by_asciiCondition.py b/src/pipeline1/preprocessing/filter_by_asciiCondition.py
@@ -0,0 +1,76 @@
+import pandas as pd
+import re
+
+
+"""
+Filters rows in a CSV file based on specific special character and pattern conditions, 
+and saves the filtered results to a new CSV file.
+
+This script performs the following tasks:
+1. Defines a set of special characters to filter.
+2. Loads a CSV file containing text data.
+3. Applies filtering conditions based on:
+    - Presence of specified special characters.
+    - A '%' character that does not have a preceding digit.
+    - A '.' character that:
+        - Is not preceded by '다'.
+        - Does not have digits on both sides.
+4. Saves the filtered rows to a new CSV file.
+
+Attributes:
+    special_char_input (str): A string of special characters to filter.
+    input_csv_path (str): Path to the input CSV file.
+    output_csv_path (str): Path to the output CSV file where filtered results are saved.
+
+Variables:
+    df (pd.DataFrame): DataFrame loaded from the input CSV file.
+    special_chars (str): Regex pattern for specified special characters.
+    percent_condition (str): Regex pattern for '%' without a preceding digit.
+    dot_condition (str): Regex pattern for '.' that does not meet specified conditions.
+    filter_pattern (str): Combined regex pattern of all filtering conditions.
+    filtered_df (pd.DataFrame): DataFrame containing rows that match the filter criteria.
+
+Usage:
+    Run this script to filter rows from the input CSV file based on the defined conditions.
+    The filtered data is saved in the output CSV file.
+
+Example:
+    ```
+    python filter_by_asciiCondition.py
+    ```
+    This will save the filtered data in `output_csv_path`.
+
+Returns:
+    None
+"""
+
+
+# Special-character filter settings
+# The backslash is spelled "\\" so the literal still holds a backslash followed
+# by '#', without relying on the invalid escape sequence "\#".
+special_char_input = "]^[<>@'+/;!=\\#`)~$*}{|:&_?,(-\""  # characters to look for
+input_csv_path = "./data/preprocessed/pipeline1_step1.csv"  # source CSV
+output_csv_path = "./data/preprocessed/pipeline1_step2.csv"  # where the matching rows are saved
+
+# Load the CSV
+df = pd.read_csv(input_csv_path)
+
+# Character class that matches any single listed character
+special_chars = f"[{re.escape(special_char_input)}]"
+
+# Additional conditions
+# A '%' that is not immediately preceded by a digit
+percent_condition = r"(?<!\d)%"
+
+# A '.' that either
+#   1) is not immediately preceded by '다', or
+#   2) has no digit immediately before it and no digit immediately after it
+# NOTE(review): the two alternatives are OR-ed, so the first one alone already
+# matches the '.' in a decimal such as "3.5". If decimals were meant to be
+# exempt, the lookarounds need to be combined - confirm the intent before
+# changing the pattern.
+dot_condition = r"(?<!다)\.|(?<!\d)\.(?!\d)"
+
+# Combine all conditions into one alternation
+filter_pattern = f"{special_chars}|{percent_condition}|{dot_condition}"
+
+# Keep the rows whose text matches; a missing text counts as no match
+filtered_df = df[df['text'].str.contains(filter_pattern, regex=True, na=False)]
+
+# Save the matching rows
+filtered_df.to_csv(output_csv_path, index=False)
+print(f"Filtered data saved to {output_csv_path}")
diff --git a/src/pipeline1/preprocessing/filter_by_asciiRatio.py b/src/pipeline1/preprocessing/filter_by_asciiRatio.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import re
+
+def calculate_special_char_ratio(text):
+    """Return the share of characters in ``text`` that lie in ASCII 33-126.
+
+    The range covers every printable ASCII character except the space, so
+    ASCII letters and digits are counted as well as punctuation.
+
+    Args:
+        text: Value to inspect.
+
+    Returns:
+        0 when ``text`` is not a string or is empty, otherwise the matching
+        character count divided by ``len(text)``.
+    """
+    if not isinstance(text, str) or len(text) == 0:
+        return 0
+    # '!' is code 33 and '~' is code 126.
+    ascii_hits = sum(1 for ch in text if "!" <= ch <= "~")
+    return ascii_hits / len(text)
+
+def filter_by_special_char_ratio(df, text_column="text", min_ratio=0, max_ratio=0.5):
+    """Return the rows whose ASCII 33-126 ratio lies in [min_ratio, max_ratio].
+
+    Args:
+        df: DataFrame to filter.
+        text_column: Column whose values are measured.
+        min_ratio: Lower bound, inclusive.
+        max_ratio: Upper bound, inclusive.
+
+    Returns:
+        The subset of ``df`` whose ratio is within the bounds.
+    """
+    def _within_bounds(value):
+        ratio = calculate_special_char_ratio(value)
+        return min_ratio <= ratio <= max_ratio
+
+    keep = df[text_column].apply(_within_bounds)
+    return df[keep]
+
+
+# Input and output locations (example paths)
+input_path = "./data/preprocessed/pipeline1_step2.csv"
+output_path = "./data/preprocessed/pipeline1_step3.csv"
+
+# Load the file
+df = pd.read_csv(input_path)
+
+# Keep the rows whose ratio is between 0 and 0.5 (the function defaults)
+filtered_df = filter_by_special_char_ratio(df, text_column="text", min_ratio=0, max_ratio=0.5)
+
+# Save the file
+filtered_df.to_csv(output_path, index=False)
+print(f"Filtered file with special character ratio between 0% and 50% saved to {output_path}")
diff --git a/src/pipeline1/preprocessing/preprocess_ascii_engWord.py b/src/pipeline1/preprocessing/preprocess_ascii_engWord.py
@@ -0,0 +1,102 @@
+import pandas as pd
+from collections import Counter
+import re
+
+"""
+Filters and processes text data in a CSV file based on specified word frequency and special character conditions, 
+then saves the processed data to a new CSV file.
+
+This script performs the following tasks:
+1. Loads text data from a CSV file.
+2. Identifies English words in uppercase with two or more letters and counts their frequency across the text data.
+3. Filters words that appear with a frequency of `min_frequency` or more (default is 2).
+4. Replaces special characters in the text with spaces, while preserving the filtered high-frequency words.
+5. Saves the processed text data to a new CSV file.
+
+Attributes:
+    input_csv_path (str): Path to the input CSV file containing text data.
+    output_csv_path (str): Path to the output CSV file where processed results are saved.
+
+Functions:
+    get_filtered_words(df, min_frequency=2):
+        Extracts and filters uppercase English words with a minimum frequency of `min_frequency`.
+        Args:
+            df (pd.DataFrame): The input DataFrame containing text data.
+            min_frequency (int): Minimum frequency for a word to be preserved.
+        Returns:
+            set: A set of words with frequency greater than or equal to `min_frequency`.
+
+    replace_special_chars_conditionally(text):
+        Replaces special characters in the text with spaces, preserving words in `filtered_words`.
+        Args:
+            text (str): A string of text to be processed.
+        Returns:
+            str: Processed text with special characters replaced by spaces, preserving high-frequency words.
+
+Usage:
+    Run this script to process text in a CSV file, where:
+    - High-frequency uppercase words with two or more letters are preserved.
+    - Special characters are replaced with spaces except in the preserved words.
+    The processed data is saved to `output_csv_path`.
+
+Example:
+    ```
+    python preprocess_ascii_engWord.py
+    ```
+    This will save the processed data in `output_csv_path`.
+
+Returns:
+    None
+"""
+
+
+
+# Input and output CSV paths
+input_csv_path = "./data/preprocessed/pipeline1_step3.csv"
+output_csv_path = "./data/preprocessed/pipeline1_step4.csv"
+
+# Load the input CSV into the frame that the rest of the script processes
+df = pd.read_csv(filepath_or_buffer=input_csv_path)
+
+# Count uppercase English words and keep the frequent ones
+def get_filtered_words(df, min_frequency=2):
+    """Return the uppercase words that occur at least ``min_frequency`` times.
+
+    A word is a run of two or more ASCII capitals delimited by word
+    boundaries. Every occurrence is counted, including repeats within a row.
+
+    Args:
+        df: DataFrame with a ``text`` column; each value is passed through
+            ``str()`` before matching.
+        min_frequency: Minimum total number of occurrences for a word to be kept.
+
+    Returns:
+        set: Words whose total count is greater than or equal to
+        ``min_frequency``.
+    """
+    word_counts = Counter(
+        word
+        for text in df['text']
+        for word in re.findall(r'\b[A-Z]{2,}\b', str(text))
+    )
+    return {word for word, count in word_counts.items() if count >= min_frequency}
+
+# Build the set of words to preserve (total frequency of 2 or more)
+filtered_words = get_filtered_words(df, min_frequency=2)
+
+# Replacement of ASCII 33-126 characters, sparing the preserved words
+def _build_replacement_pattern(words):
+    """Compile the regex used by replace_special_chars_conditionally.
+
+    With preserved words, group 1 matches any of them as a whole word and the
+    second alternative matches one character in ASCII 33-126. Without
+    preserved words the pattern has no group at all: an empty alternation
+    would match the empty string at every word boundary, and each of those
+    matches would be turned into an extra space.
+    """
+    if not words:
+        return re.compile(r'[!-~]')
+    preserved = '|'.join(re.escape(word) for word in sorted(words))
+    return re.compile(r'\b(' + preserved + r')\b|[!-~]')
+
+
+def _keep_word_or_space(match):
+    """Return a preserved word unchanged, and a space for anything else."""
+    # lastindex is None unless the preserved-word group took part in the match.
+    return match.group(0) if match.lastindex else ' '
+
+
+# Compiled once from the module-level word set instead of once per row.
+_REPLACEMENT_PATTERN = _build_replacement_pattern(filtered_words)
+
+
+def replace_special_chars_conditionally(text):
+    """Replace ASCII 33-126 characters with spaces, keeping preserved words.
+
+    Args:
+        text: Value to process. Anything that is not a ``str`` is passed through.
+
+    Returns:
+        The processed string, in which words from ``filtered_words`` are kept
+        and every other character in ASCII 33-126 is replaced by a space; or
+        ``text`` itself when it is not a string.
+    """
+    if not isinstance(text, str):
+        return text
+    return _REPLACEMENT_PATTERN.sub(_keep_word_or_space, text)
+
+# Run the replacement over every value of the text column
+processed_text = df['text'].map(replace_special_chars_conditionally)
+df['text'] = processed_text
+
+# Save the result
+df.to_csv(output_csv_path, index=False)
+print(f"Processed data saved to {output_csv_path}")
+
+