#' Parse one protein-level change and apply it to a wild-type sequence
#'
#' Understands the notations handled below: single-residue substitution
#' ("p.A12T", "p.Ala12Thr"), single-residue deletion ("del" or "~"), stop
#' ("*" or "Ter"), synonymous ("="), range deletion ("p.A2_C4del"), range
#' "delins" with a single replacement residue, and the wild-type labels
#' "p.=", "p.(=)" and "_wt". The first two characters of `aaChg` are skipped
#' as a prefix. Frameshifts ("fs") and anything that cannot be resolved give
#' an `NA` mutant sequence.
#'
#' @param aaChg Character scalar describing the amino-acid change.
#' @param wt.sequence Character scalar with the wild-type sequence; if `NA`,
#'   every field of the result is `NA`.
#' @param offset Number added to every position parsed from `aaChg`.
#' @param seq.lim Maximum window length. When the mutant or the wild-type
#'   sequence is longer than this (and a position was parsed), both are
#'   trimmed to a window around the position.
#' @return A list with `ref`, `pos`, `alt`, the (possibly trimmed) `wt` and
#'   `sequence`, `sequence.len`, the window bounds `seq.start`/`seq.end`, and
#'   the untrimmed `pos.orig`, `sequence.orig`, `wt.orig`,
#'   `sequence.len.orig`.
parse_one_substitute <- function(aaChg, wt.sequence, offset=0, seq.lim=1001) {
  if (is.na(wt.sequence)) {
    return(list(ref = NA, pos = NA, alt = NA,
                wt = NA,
                sequence = NA,
                sequence.len = NA,
                seq.start = NA,
                seq.end = NA,
                pos.orig = NA,
                sequence.orig = NA,
                wt.orig = NA,
                sequence.len.orig = NA))
  }

  # Three-letter -> one-letter residue codes.
  three_to_one <- c(
    "Ala" = "A", "Arg" = "R", "Asn" = "N", "Asp" = "D", "Cys" = "C",
    "Gln" = "Q", "Glu" = "E", "Gly" = "G", "His" = "H", "Ile" = "I",
    "Leu" = "L", "Lys" = "K", "Met" = "M", "Phe" = "F", "Pro" = "P",
    "Pyl" = "O", "Ser" = "S", "Sec" = "U", "Thr" = "T", "Trp" = "W",
    "Tyr" = "Y", "Val" = "V", "Asx" = "B", "Glx" = "Z", "Xaa" = "X",
    "Xle" = "J"
  )
  # Codes longer than one character are looked up in the table (unknown
  # codes become NA); one-character codes and NA pass through unchanged.
  to_one_letter <- function(code) {
    if (!is.na(code) && nchar(code) > 1) {
      code <- as.character(three_to_one[code])
    }
    code
  }
  wt_labels <- c("p.=", "p.(=)", "_wt")

  # Locate every run of digits. Match offsets are kept so the text around
  # each number is sliced by position rather than by re-splitting on the
  # digits, which breaks when one number is a substring of the other.
  if (is.na(aaChg)) {
    hits <- -1L
  } else {
    hits <- gregexpr("[0-9]+", aaChg)[[1]]
  }
  if (hits[1] == -1) {
    num_start <- integer(0)
    num_end <- integer(0)
    pos <- numeric(0)
  } else {
    num_start <- as.integer(hits)
    num_end <- num_start + attr(hits, "match.length") - 1L
    pos <- as.numeric(substring(aaChg, num_start, num_end)) + offset
  }
  n_num <- length(pos)
  wt_len <- nchar(wt.sequence)

  if (n_num == 0) {
    # No position at all: only the explicit wild-type labels give a sequence.
    ref <- NA
    pos <- NA
    alt <- NA
    newseq <- if (aaChg %in% wt_labels) wt.sequence else NA_character_
  } else if (n_num == 2 && !grepl("fs", aaChg)) {
    # Range change: <ref1><pos1>_<ref2><pos2>del | delins<alt>
    ref_first <- to_one_letter(substr(aaChg, 3, num_start[1] - 1))
    link <- substr(aaChg, num_end[1] + 1, num_start[2] - 1)
    # Drop the separator character between the two residues.
    ref_last <- to_one_letter(substr(link, 2, nchar(link)))
    change <- substr(aaChg, num_end[2] + 1, nchar(aaChg))
    if (change == "del") {
      alt <- ""
    } else {
      # NA when the suffix is not of the form "delins<alt>".
      alt <- to_one_letter(strsplit(change, "delins", fixed = TRUE)[[1]][2])
    }
    ref <- substr(wt.sequence, pos[1], pos[2])
    ref_ok <- pos[1] >= 1 && pos[1] <= pos[2] &&
      isTRUE(ref_first == substr(wt.sequence, pos[1], pos[1])) &&
      isTRUE(ref_last == substr(wt.sequence, pos[2], pos[2]))
    if (ref_ok) {
      if (is.na(alt)) {
        newseq <- NA_character_
      } else {
        # Replace the whole range pos[1]..pos[2] (inclusive) by `alt`.
        newseq <- paste0(substr(wt.sequence, 1, pos[1] - 1),
                         alt,
                         substr(wt.sequence, pos[2] + 1, wt_len))
      }
      pos <- pos[1]
    } else {
      alt <- NA
      newseq <- NA_character_
      pos <- NA
    }
  } else if (n_num == 2) {
    # Frameshift with two numbers: only the position is reported.
    pos <- pos[1]
    ref <- substr(wt.sequence, pos, pos)
    alt <- NA
    newseq <- NA_character_
  } else if (n_num > 2) {
    # More than two numbers is not a notation handled here.
    ref <- NA
    pos <- NA
    alt <- NA
    newseq <- NA_character_
  } else if (pos > wt_len) {
    # Position lies beyond the end of the wild-type sequence.
    ref <- NA
    alt <- NA
    newseq <- NA_character_
  } else {
    # Single-position change: <ref><pos><change>
    ref <- to_one_letter(substr(aaChg, 3, num_start[1] - 1))
    change <- substr(aaChg, num_end[1] + 1, nchar(aaChg))
    wt_residue <- substr(wt.sequence, pos, pos)
    alt <- NA
    if (isTRUE(ref == wt_residue)) {
      if (change %in% c("~", "del")) {
        newseq <- paste0(substr(wt.sequence, 1, pos - 1),
                         substr(wt.sequence, pos + 1, wt_len))
      } else if (change %in% c("*", "Ter")) {
        # Stop: keep everything before the position.
        newseq <- substr(wt.sequence, 1, pos - 1)
      } else if (change == "=") {
        newseq <- wt.sequence
      } else if (change == "") {
        # Nothing after the position: no change can be applied.
        newseq <- NA_character_
      } else {
        alt <- to_one_letter(change)
        if (is.na(alt)) {
          newseq <- NA_character_
        } else {
          newseq <- wt.sequence
          substr(newseq, pos, pos) <- alt
        }
      }
    } else {
      # Stated reference disagrees with the wild type: report the actual one.
      ref <- wt_residue
      newseq <- NA_character_
    }
  }

  # Sequences of at most one residue are treated as missing.
  if (!is.na(newseq) && nchar(newseq) <= 1) {
    newseq <- NA_character_
  }

  # Explicit NA length: nchar() of a logical NA would be 2.
  sequence.len.orig <- if (is.na(newseq)) NA_integer_ else nchar(newseq)
  sequence.orig <- newseq
  pos.orig <- pos
  wt.orig <- wt.sequence

  needs_trim <- !is.na(sequence.len.orig) && !is.na(pos.orig) &&
    (sequence.len.orig > seq.lim || nchar(wt.orig) > seq.lim)
  if (needs_trim) {
    half_window <- (seq.lim - 1) / 2
    if (pos.orig < (seq.lim + 1) / 2) {
      # Position close to the start: window is anchored at residue 1.
      seq.start <- 1
      seq.end <- seq.lim
      pos <- pos.orig
    } else if (pos.orig + half_window > sequence.len.orig) {
      # Position close to the end of the mutant: window is anchored at its
      # end. The start is clamped so a mutant shorter than seq.lim does not
      # give a non-positive start and a shifted position.
      seq.start <- max(1, sequence.len.orig - seq.lim + 1)
      seq.end <- sequence.len.orig
      pos <- pos.orig - seq.start + 1
    } else {
      # Window centred on the position.
      seq.start <- pos.orig - half_window
      seq.end <- pos.orig + half_window
      pos <- (seq.lim + 1) / 2
    }
    # The same bounds are applied to mutant and wild type.
    sequence <- substr(sequence.orig, seq.start, seq.end)
    wt <- substr(wt.orig, seq.start, seq.end)
    sequence.len <- min(seq.lim, nchar(sequence))
  } else {
    sequence.len <- sequence.len.orig
    sequence <- sequence.orig
    wt <- wt.sequence
    pos <- pos.orig
    seq.start <- 1
    seq.end <- sequence.len.orig
  }

  result <- list(ref = ref, pos = pos, alt = alt,
                 wt = wt,
                 sequence = sequence,
                 sequence.len = sequence.len,
                 seq.start = seq.start,
                 seq.end = seq.end,
                 pos.orig = pos.orig,
                 sequence.orig = sequence.orig,
                 wt.orig = wt.orig,
                 sequence.len.orig = sequence.len.orig)
  result
}