safetyspeech-app / download_datasets_direct.py
aryan012234's picture
Deployment ready SafetySpeech system
282ecd6
Raw
History Blame Contribute Delete
1.38 kB
import os
import pandas as pd
from huggingface_hub import hf_hub_download
def main(output_dir='data/external'):
    """Download two external datasets from the Hugging Face Hub and save them as CSV.

    Each dataset is fetched with ``hf_hub_download``, given a string ``label``
    column, and written into ``output_dir``:

    * ``depression_reddit.csv`` -- every column of the downloaded CSV plus
      ``label``, which is ``labels`` with ``'depression'`` replaced by
      ``'depressive'``.
    * ``hate_speech_ucsd.csv`` -- only ``text`` and ``label``, where ``label``
      is ``'hate_speech'`` when ``hate_speech_score > 0.5`` and ``'normal'``
      otherwise. Rows missing ``text`` or ``hate_speech_score`` are dropped.

    Progress messages are printed to stdout.

    Args:
        output_dir: Directory that receives the two CSV files; created if it
            does not exist. Defaults to ``'data/external'``, the location
            that used to be hard-coded.

    Raises:
        KeyError: If a downloaded file lacks one of the columns read here
            (``labels``, ``text``, ``hate_speech_score``).
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1. Depression Reddit (download the raw CSV)
    print("Downloading Depression Reddit dataset...")
    dep_file = hf_hub_download(
        repo_id="mrjunos/depression-reddit-cleaned",
        repo_type="dataset",
        filename="depression_reddit_cleaned_ds.csv",
    )
    df_dep = pd.read_csv(dep_file)
    # The CSV is expected to have the columns 'text' and 'labels'.
    # 'labels' is kept as-is; 'label' is added next to it with the renamed class.
    df_dep['label'] = df_dep['labels'].replace({'depression': 'depressive'})
    df_dep.to_csv(os.path.join(output_dir, 'depression_reddit.csv'), index=False)
    print(f"Saved depression_reddit.csv ({len(df_dep)} rows)")

    # 2. Measuring Hate Speech (download the parquet file directly)
    # NOTE: the repo id below is `ucberkeley-dlab`, not UCSD. The "UCSD"
    # wording in the message and output file name is left unchanged in case
    # other code looks for `hate_speech_ucsd.csv`.
    print("\nDownloading UCSD Hate Speech dataset...")
    hate_file = hf_hub_download(
        repo_id="ucberkeley-dlab/measuring-hate-speech",
        repo_type="dataset",
        filename="data/train-00000-of-00001.parquet",
    )
    scored = pd.read_parquet(hate_file)[['text', 'hate_speech_score']].dropna()
    # assign() builds a new frame instead of writing a column into the result
    # of a slice + dropna(), which avoids pandas' chained-assignment warning.
    is_hate = scored['hate_speech_score'] > 0.5
    df_hate = scored.assign(
        label=is_hate.map({True: 'hate_speech', False: 'normal'}),
    )
    df_hate[['text', 'label']].to_csv(
        os.path.join(output_dir, 'hate_speech_ucsd.csv'), index=False
    )
    print(f"Saved hate_speech_ucsd.csv ({len(df_hate)} rows)")

    print("\nBoth external datasets downloaded successfully!")
# Run the downloads only when this file is executed as a script; importing
# the module has no side effects.
if __name__ == "__main__":
    main()