| import pandas as pd |
| import os |
|
|
| import torch |
| from transformers import RobertaTokenizerFast, RobertaForMaskedLM, DataCollatorWithPadding |
|
|
| import datasets |
| from datasets import disable_caching |
| disable_caching() |
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

DEVICE = 'cuda:0'                                  # torch device the encoder runs on
ENCODER_MODEL_NAME = "entropy/roberta_zinc_480m"   # HF hub id of the SMILES encoder
ENCODER_BATCH_SIZE = 1024                          # batch size for the embedding pass


TOKENIZER_MAX_LEN = 256        # hard cap on tokenized sequence length (truncation)
TOKENIZATION_NUM_PROC = 32     # worker processes for CPU tokenization


'''
Data source is expected to be a CSV file with a column of SMILES strings
denoted by `SMILES_COLUMN`. The CSV is processed in chunks of size `PROCESS_CHUNKSIZE`.

Processed chunks are saved to `SAVE_PATH` with the format `SAVE_PATH/processed_shard_{i}.hf`
'''


DATASET_CSV_FILENAME = None    # path to the input CSV (required, must be set)
PROCESS_CHUNKSIZE = 1000000    # CSV rows per chunk / per output shard
SMILES_COLUMN = 'smiles'       # name of the SMILES column in the CSV
MAX_CHUNKS = None              # optional cap on number of chunks processed (None = all)
MAX_SMILES_LENGTH = 90         # keep SMILES strictly shorter than this (None = no cap)
MIN_SMILES_LENGTH = 5          # keep SMILES strictly longer than this (None = no floor)
FILTER_NUM_PROC = 32           # worker processes for the length filter
SAVE_PATH = None               # output directory for processed shards (required, must be set)


# Fail fast on missing required config. Explicit raises are used instead of
# `assert`, which is silently stripped when Python runs with -O.
if DATASET_CSV_FILENAME is None:
    raise ValueError("must specify dataset filename")
if SAVE_PATH is None:
    raise ValueError("must specify save path")
|
|
|
|
def tokenization(example):
    """Tokenize a batch of SMILES strings, truncating to TOKENIZER_MAX_LEN.

    Intended for `datasets.Dataset.map(..., batched=True)`; relies on the
    module-level `tokenizer`.
    """
    return tokenizer(
        example[SMILES_COLUMN],
        add_special_tokens=True,
        truncation=True,
        max_length=TOKENIZER_MAX_LEN,
    )
|
|
def embed(inputs):
    """Mean-pool the encoder's final hidden states over non-padding tokens.

    Takes a batch dict containing 'input_ids' and 'attention_mask' (as produced
    by `tokenization`) and returns a new 'encoder_hidden_states' column with one
    pooled embedding per sequence. Uses the module-level `collator`, `model`,
    and `DEVICE`.
    """
    # Keep only the tensors the model consumes, pad to a rectangular batch,
    # and move everything onto the target device.
    features = {key: inputs[key] for key in ('input_ids', 'attention_mask')}
    batch = collator(features)
    batch = {key: tensor.to(DEVICE) for key, tensor in batch.items()}

    # Inference only -- no gradients needed.
    with torch.no_grad():
        outputs = model(**batch, output_hidden_states=True)

    last_hidden = outputs[-1][-1]          # last element of outputs is the hidden-states tuple; take the final layer
    attention_mask = batch['attention_mask']

    # Masked mean over the sequence dimension: zero out padding positions,
    # sum, then divide by the number of real tokens in each sequence.
    summed = (last_hidden * attention_mask.unsqueeze(-1)).sum(1)
    token_counts = attention_mask.sum(-1).unsqueeze(-1)

    return {'encoder_hidden_states': summed / token_counts}
|
|
# Sentinel meaning "fall back to the module-level setting"; needed because
# None is itself a meaningful bound value (it disables that check).
_UNSET = object()


def length_filter_smiles(example, column=_UNSET, min_length=_UNSET, max_length=_UNSET):
    """Return True if the example's SMILES entry is a string within the length bounds.

    Args:
        example: mapping with a SMILES entry under `column`.
        column: key holding the SMILES string; defaults to SMILES_COLUMN.
        min_length: keep strings strictly longer than this; None disables the
            check. Defaults to MIN_SMILES_LENGTH.
        max_length: keep strings strictly shorter than this; None disables the
            check. Defaults to MAX_SMILES_LENGTH.
    """
    if column is _UNSET:
        column = SMILES_COLUMN
    if min_length is _UNSET:
        min_length = MIN_SMILES_LENGTH
    if max_length is _UNSET:
        max_length = MAX_SMILES_LENGTH

    smiles = example[column]

    # Type check FIRST: pandas yields float NaN for missing CSV cells, and the
    # original code called len() before checking the type, raising TypeError.
    if not isinstance(smiles, str):
        return False
    if (min_length is not None) and not (len(smiles) > min_length):
        return False
    # BUG FIX: the original gated this max-length check on MIN_SMILES_LENGTH
    # being set (copy-paste error), so max filtering was skipped whenever the
    # min bound was disabled.
    if (max_length is not None) and not (len(smiles) < max_length):
        return False
    return True
|
|
|
|
# Load the pretrained SMILES tokenizer, plus a padding collator so that
# variable-length tokenized examples can be stacked into rectangular tensors.
tokenizer = RobertaTokenizerFast.from_pretrained(ENCODER_MODEL_NAME, max_len=TOKENIZER_MAX_LEN)
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')


# Load the encoder onto DEVICE in eval mode; it is used purely for inference
# (hidden-state extraction in `embed`), never trained here.
model = RobertaForMaskedLM.from_pretrained(ENCODER_MODEL_NAME)
model.to(DEVICE)
model.eval()
|
|
# Stream the CSV in fixed-size chunks so arbitrarily large files never have to
# fit in memory at once; only the SMILES column is read.
df_iter = pd.read_csv(DATASET_CSV_FILENAME, chunksize=PROCESS_CHUNKSIZE, usecols=[SMILES_COLUMN])


for i, df in enumerate(df_iter):
    print(f'processing dataset chunk {i}')

    dataset = datasets.Dataset.from_pandas(df)

    # Drop rows whose SMILES are missing or outside the configured length
    # bounds. Pass the filter function directly -- wrapping it in a
    # single-argument lambda (as the original did) was redundant.
    dataset = dataset.filter(length_filter_smiles, num_proc=FILTER_NUM_PROC)

    # Tokenization is CPU-bound, so it fans out across worker processes.
    dataset = dataset.map(tokenization, batched=True, num_proc=TOKENIZATION_NUM_PROC)

    # Embedding runs on the single GPU held by the parent process, so no
    # num_proc here; batch_size controls GPU memory use.
    dataset = dataset.map(embed, batched=True, batch_size=ENCODER_BATCH_SIZE)

    # One shard on disk per CSV chunk: SAVE_PATH/processed_shard_{i}.hf
    dataset.save_to_disk(os.path.join(SAVE_PATH, f'processed_shard_{i}.hf'))

    # Optional early stop after MAX_CHUNKS chunks (i is zero-based).
    if (MAX_CHUNKS is not None) and (i >= MAX_CHUNKS - 1):
        break


print('finished data processing')
|
|
|
|
|
|