import os
from typing import Optional

from huggingface_hub import login, HfApi, hf_hub_download
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError


def download_hf_dataset(
    repo_id: str,
    local_dir: str,
    token: Optional[str] = None,
    repo_type: str = "dataset",
) -> bool:
    """Download a full repository snapshot from the HuggingFace Hub.

    Errors are reported on stdout and converted into a ``False`` return
    value; no exception escapes this function.

    Args:
        repo_id: Repository ID in the form ``'username/repo_name'``.
        local_dir: Directory to download into. Created if it does not exist.
        token: HuggingFace API token. When ``None`` or empty, ``login()`` is
            skipped and whatever credentials ``huggingface_hub`` already has
            available are used. Never hard-code a real token as the default;
            pass it in from the caller or the environment instead.
        repo_type: Repository type passed to ``snapshot_download``; defaults
            to ``'dataset'``.

    Returns:
        True if the snapshot was downloaded, False if any error occurred.
    """
    try:
        # Only authenticate when the caller actually supplied a token.
        if token:
            login(token=token)

        os.makedirs(local_dir, exist_ok=True)

        # To download through a mirror, set the HF_ENDPOINT environment
        # variable (e.g. https://hf-mirror.com) before starting the script.
        # NOTE(review): `local_dir_use_symlinks` is deprecated in recent
        # huggingface_hub releases -- confirm against the pinned version
        # before removing it.
        HfApi().snapshot_download(
            repo_id=repo_id,
            repo_type=repo_type,
            local_dir=local_dir,
            local_dir_use_symlinks=False,
        )
    except RepositoryNotFoundError:
        # Must precede HfHubHTTPError: RepositoryNotFoundError is a subclass
        # of it, so the more specific message would otherwise never be shown.
        print(f"Error: Repository {repo_id} not found on HuggingFace Hub")
        return False
    except HfHubHTTPError as e:
        print(f"HTTP Error: {e}")
        return False
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the
        # return value, as documented above.
        print(f"Unexpected error occurred: {e}")
        return False
    else:
        print(f"Successfully downloaded dataset {repo_id} to {local_dir}")
        return True


# Example usage
if __name__ == "__main__":
    # Example parameters
    repo_id = "chaiting/pk-3976-L5"
    local_dir = "/home/hsichen/LLaMA-Factory/data/chaiting/pk-3976-L5"

    # Propagate failure to the shell so callers/CI can detect it.
    succeeded = download_hf_dataset(repo_id, local_dir)
    raise SystemExit(0 if succeeded else 1)