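"""
Cloud training script for Chinese sentiment classification.

Fine-tunes a pretrained Chinese BERT model on a combination of the
clapAI/MultiLingualSentiment (Chinese subset) and
OpenModels/Chinese-Herbal-Medicine-Sentiment datasets for 3-class
sentiment analysis (negative / neutral / positive), then saves
checkpoints, the final model, and training-curve plots.
"""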
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


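# ============================================================
# Configuration
# ============================================================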
class Config:
    # Base pretrained model (Chinese BERT)
    BASE_MODEL = "google-bert/bert-base-chinese"

    # Directory layout (relative to the current working directory)
    BASE_DIR = os.getcwd()
    DATA_DIR = os.path.join(BASE_DIR, "data")
    CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
    RESULTS_DIR = os.path.join(BASE_DIR, "results")
    DOCS_DIR = os.path.join(BASE_DIR, "docs")

    # Label space
    NUM_LABELS = 3
    LABEL2ID = {'negative': 0, 'neutral': 1, 'positive': 2}
    ID2LABEL = {0: 'negative', 1: 'neutral', 2: 'positive'}

    # Training hyperparameters
    MAX_LENGTH = 128
    BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 3
    WARMUP_RATIO = 0.1
    SAVE_STEPS = 500
    LOGGING_STEPS = 100


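# ============================================================
# Utilities
# ============================================================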
def ensure_directories():
    """Ensure that all required directories exist."""
    for path in [Config.DATA_DIR, Config.CHECKPOINT_DIR, Config.RESULTS_DIR, Config.DOCS_DIR]:
        if not os.path.exists(path):
            os.makedirs(path)
            print(f">>> Created directory: {path}")


def plot_training_history(log_history, save_path):
    """Plot the training curves and save them to disk."""
    try:
        # Fonts that can render CJK characters (falls back to DejaVu Sans)
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

        df = pd.DataFrame(log_history)
        train_loss = df[df['loss'].notna()] if 'loss' in df.columns else pd.DataFrame()
        eval_acc = df[df['eval_accuracy'].notna()] if 'eval_accuracy' in df.columns else pd.DataFrame()

        if train_loss.empty:
            return

        plt.figure(figsize=(12, 5))

        # Left panel: training / validation loss
        plt.subplot(1, 2, 1)
        plt.plot(train_loss['epoch'], train_loss['loss'], label='Train Loss', color='#FF6B6B')
        if 'eval_loss' in df.columns:
            eval_loss = df[df['eval_loss'].notna()]
            plt.plot(eval_loss['epoch'], eval_loss['eval_loss'], label='Val Loss', color='#4ECDC4')
        plt.title('Loss Curve')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Right panel: validation accuracy
        if not eval_acc.empty:
            plt.subplot(1, 2, 2)
            plt.plot(eval_acc['epoch'], eval_acc['eval_accuracy'], label='Val Accuracy', color='#6BCB77', marker='o')
            plt.title('Accuracy Curve')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.legend()
            plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(save_path)
        print(f">>> Plot saved to {save_path}")
        plt.close()
    except Exception as e:
        print(f"Warning: Plotting failed ({e})")


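# ============================================================
# Data processing
# ============================================================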
class DataProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def clean_data(self, example):
        """Keep only non-empty, meaningful review texts."""
        text = example['text']
        if text is None:
            return False
        # Drop the platform's default placeholder review ("this user left no review text")
        if "此用户未填写评价内容" in text:
            return False
        if len(text.strip()) < 2:
            return False
        return True

    def unify_labels(self, example):
        """Map heterogeneous label formats onto the 0/1/2 scheme in Config."""
        label = example['label']
        if isinstance(label, str):
            label = label.lower()
            if label in ['negative', 'neg', '0']:
                return {'label': 0}
            elif label in ['neutral', 'neu', '1']:
                return {'label': 1}
            elif label in ['positive', 'pos', '2']:
                return {'label': 2}
        return {'label': int(label)}

    def tokenize_function(self, examples):
        return self.tokenizer(examples['text'], padding="max_length", truncation=True, max_length=Config.MAX_LENGTH)

    def get_dataset(self):
        print(">>> Loading Datasets...")
        ds_clap = load_dataset("clapAI/MultiLingualSentiment", split="train", trust_remote_code=True, cache_dir=Config.DATA_DIR)
        ds_med = load_dataset("OpenModels/Chinese-Herbal-Medicine-Sentiment", split="train", trust_remote_code=True, cache_dir=Config.DATA_DIR)

        # Align column names and keep only the Chinese subset of the multilingual corpus
        if 'review_text' in ds_med.column_names:
            ds_med = ds_med.rename_column('review_text', 'text')
        if 'sentiment_label' in ds_med.column_names:
            ds_med = ds_med.rename_column('sentiment_label', 'label')
        if 'language' in ds_clap.column_names:
            ds_clap = ds_clap.filter(lambda x: x['language'] == 'zh')

        common_cols = ['text', 'label']
        combined = concatenate_datasets([ds_clap.select_columns(common_cols), ds_med.select_columns(common_cols)])

        # Clean, normalize labels, then tokenize; keep 'label' so the Trainer can compute the loss
        combined = combined.filter(self.clean_data).map(self.unify_labels)
        tokenized = combined.map(self.tokenize_function, batched=True, remove_columns=['text'])

        return tokenized.train_test_split(test_size=0.1)


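# ============================================================
# Metrics
# ============================================================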
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}


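# ============================================================
# Main
# ============================================================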
def main():
    print("=== Cloud Training Script ===")
    ensure_directories()

    if torch.cuda.is_available():
        print(f"✅ CUDA Enabled: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ Running on CPU")

    tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)
    processor = DataProcessor(tokenizer)
    dataset = processor.get_dataset()

    model = AutoModelForSequenceClassification.from_pretrained(
        Config.BASE_MODEL,
        num_labels=Config.NUM_LABELS,
        id2label=Config.ID2LABEL,
        label2id=Config.LABEL2ID
    )

    training_args = TrainingArguments(
        output_dir=Config.CHECKPOINT_DIR,
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        learning_rate=Config.LEARNING_RATE,
        warmup_ratio=Config.WARMUP_RATIO,
        logging_dir=os.path.join(Config.RESULTS_DIR, 'logs'),
        logging_steps=Config.LOGGING_STEPS,
        eval_strategy="steps",
        eval_steps=Config.SAVE_STEPS,
        save_steps=Config.SAVE_STEPS,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    print(">>> Starting Training...")
    trainer.train()

    # Save the best checkpoint (restored by load_best_model_at_end) together with its tokenizer
    final_path = os.path.join(Config.CHECKPOINT_DIR, "final_model")
    print(f">>> Saving Final Model to {final_path}...")
    trainer.save_model(final_path)
    tokenizer.save_pretrained(final_path)

    # Plot loss / accuracy curves from the Trainer's log history
    print(">>> Generating Plots...")
    plot_path = os.path.join(Config.RESULTS_DIR, "training_curves_cloud.png")
    plot_training_history(trainer.state.log_history, plot_path)

    print(">>> All Done!")


if __name__ == "__main__":
    main()