aroma.affymetrix 1.7.0
aroma.cn 0.5.0
What's new?
Author: Pierre Neuvial
Created: 2010-05-30
Modified: 2010-06-08
|
This vignette illustrates the TumorBoost method for normalizing allelic ratios from genotyping microarrays, which is described in Bengtsson et al. (2010). The input of the method is a single tumor-normal pair of allelic ratios from genotyping microarrays.
|
Figure: Allele B fractions before and after TumorBoost normalization. |
Note: In the above data sets, files corresponding to both allelic ratios ('fracB') and total intensities ('total') are listed. Total intensities are only used at the end of this vignette to display total copy number profiles. Total intensities are not used for TumorBoost normalization, and total copy numbers are not affected by TumorBoost normalization.
To normalize a tumor sample, TumorBoost uses genotype calls (and optionally genotype confidence scores) of the matched normal sample. In this vignette we assume that such genotype calls are available to us (see 'callData' above). In the vignette 'Naive genotype calls and associated confidence scores' we show how such genotype calls can be estimated only based on allelic ratios in the normal sample, as described in the TumorBoost paper.
To use other genotype calls than the naive ones, a data set with similar structure as above has to be created, for example:
The creation of such a data set depends on the input data (Birdseed or BeadStudio genotypes for example). It is not documented here.
For Affymetrix genotyping microarrays, input data can be obtained using the CRMA v2 method (Bengtsson et al. 2009), e.g.
ds <- doASCRMAv2("TCGA,GBM", chipType="GenomeWideSNP_6,Full");
Setup
library("aroma.cn");
log <- verbose <- Arguments$getVerbose(-8, timestamp=TRUE);
rootPath <- "totalAndFracBData";
rootPath <- Arguments$getReadablePath(rootPath);
dataSets <- c("TCGA,GBM,BeadStudio,XY", "TCGA,GBM,CRMAv2");
dataSet <- dataSets[1]; ## Work with Illumina data
# Fullnames translator
fnt <- function(names, ...) {
pattern <- "^(TCGA-[0-9]{2}-[0-9]{4})-([0-9]{2}[A-Z])[-]*(.*)";
gsub(pattern, "\\1,\\2,\\3", names);
} # fnt()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Load the raw (tumor,normal) data set # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ds <- AromaUnitFracBCnBinarySet$byName(dataSet, chipType="*", paths=rootPath); setFullNamesTranslator(ds, fnt); print(ds); sampleNames <- sort(unique(getNames(ds))); sampleName <- sampleNames[1]; pair <- indexOf(ds, sampleName); stopifnot(length(pair) == 2); # Order as (tumor,normal) types <- sapply(extract(ds,pair), FUN=function(df) getTags(df)[1]); o <- order(types); types <- types[o]; pair <- pair[o]; # Extract (tumor, normal) pair dsPair <- extract(ds, pair); dsT <- extract(dsPair, 1); print(dsT);
dsN <- extract(dsPair, 2); print(dsN);
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Load the genotype call set
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Identify available genotype calls
rootPath <- "callData";
rootPath <- Arguments$getReadablePath(rootPath);
genotypeTag <- "NGC";
gsN <- AromaUnitGenotypeCallSet$byName(dataSet, tags=genotypeTag, chipType="*");
setFullNamesTranslator(gsN, fnt);
# Keep only normal genotype files (not needed here, but could be needed in other situations)
types <- sapply(gsN, FUN=function(df) getTags(df)[1]);
types <- gsub("[A-Z]$", "", types);
keep <- which(is.element(types, c("10", "11")));
gsN <- extract(gsN, keep);
print(gsN);
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create a list of matched data sets
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dsList <- list(normal=dsN, tumor=dsT, callsN=gsN);
dsList <- lapply(dsList, FUN=function(ds) {
idxs <- indexOf(ds, getNames(dsList$normal));
extract(ds, idxs);
});
print(dsList);dummy <- lapply(dsList, FUN=function(ds) print(getFile(ds,1)));
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Normalize allele B fractions for tumors given matched normals
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tbn <- TumorBoostNormalization(dsList$tumor, dsList$normal, gcN=dsList$callsN, tags=c("*", "NGC"));
dsTN <- process(tbn, verbose=log);
setFullNamesTranslator(dsTN, fnt);
print(dsTN);
This has created the following data set:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create a list of matched data sets
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dsList <- list(normal=dsN, tumor=dsT, tumorN=dsTN, callsN=gsN);
rm(dsN, dsT, dsTN, gsN);
dsList <- lapply(dsList, FUN=function(ds) {
idxs <- indexOf(ds, getNames(dsList$normal));
extract(ds, idxs);
});figPath <- Arguments$getWritablePath("figures");
siteTag <- getTags(ds);
siteTag <- paste(siteTag[-1], collapse=",");
print(siteTag);
ugp <- getAromaUgpFile(dsList$tumor);
chromosome <- 2;
chrTag <- sprintf("Chr%02d", chromosome);
units <- getUnitsOnChromosome(ugp, chromosome=chromosome);
# Identify SNPs only
platform <- getPlatform(ugp);
if (platform == "Affymetrix") {
require("aroma.affymetrix") || throw("Package not loaded: aroma.affymetrix");
snpPattern <- "^SNP";
} else if (platform == "Illumina") {
snpPattern <- "^rs[0-9]";
} else {
throw("Unknown platform: ", platform);
}
unf <- getUnitNamesFile(ugp);
unitNames <- getUnitNames(unf, units=units);
# Identify SNP units
keep <- (regexpr(snpPattern, unitNames) != -1);
units <- units[keep];
pos <- getPositions(ugp, units=units);
# Extract Allele B fractions
kk <- 1;
dfList <- lapply(dsList, FUN=getFile, kk);
beta <- lapply(dfList, FUN=function(df) df[units,1,drop=TRUE]);
beta <- as.data.frame(beta);
beta <- as.matrix(beta);
names <- colnames(beta);
names[names == "tumorN"] <- "normalized tumor";
# Plot dimensions
x <- pos/1e6;
xlim <- range(x, na.rm=TRUE);
xlab <- "Position (Mb)";
width <- 840;
width <- 1280;
aspect <- 0.6*1/3;# Plot Allele B fractions
ylim <- c(-0.05,1.05);
ylim <- c(-0.1,1.1);
ylab <- "Allele B Fraction";
cols <- as.integer(beta[,"callsN"] != 1) + 1L;
for (cc in 1:3) {
tag <- colnames(beta)[cc];
name <- names[cc];
figName <- sprintf("%s,%s,%s,%s,fracB", siteTag, sampleName, tag, chrTag);
pathname <- filePath(figPath, sprintf("%s.png", figName));
if (!isFile(pathname)) {
fig <- devNew("png", pathname, label=figName, width=width, height=aspect*width);
par(mar=c(2.7,2.5,1.1,1)+0.1, tcl=-0.3, mgp=c(1.4,0.4,0), cex=2);
par(mar=c(1.7,2.5,1.1,1)+0.1);
plot(NA, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, axes=FALSE);
axis(side=1);
axis(side=2, at=c(0,1/2,1));
points(x, beta[,cc], pch=".", col=cols);
label <- sprintf("%s (%s)", sampleName, name);
stext(side=3, pos=0, label);
stext(side=3, pos=1, chrTag);
devDone();
}
}# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Load raw total CN data set # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rootPath <- "totalAndFracBData"; rootPath <- Arguments$getReadablePath(rootPath); dsC <- AromaUnitTotalCnBinarySet$byName(dataSet, chipType="*", paths=rootPath); setFullNamesTranslator(dsC, fnt); print(dsC);
pairC <- indexOf(dsC, sampleName);
stopifnot(length(pairC) == 2);
# Order as (tumor,normal)
types <- sapply(extract(dsC, pairC), FUN=function(df) getTags(df)[1]);
o <- order(types);
types <- types[o];
pairC <- pairC[o];
# Extract (tumor, normal) pair
dsPairC <- extract(dsC, pairC);
# Extract total CNs
C <- extractMatrix(dsPairC, units=units);
C <- 2*C[,1]/C[,2];
# Plot total CNs
ylim <- c(0,6);
ylab <- "Copy number";
figName <- sprintf("%s,%s,%s,CN", siteTag, sampleName, chrTag);
pathname <- filePath(figPath, sprintf("%s.png", figName));
if (!isFile(pathname)) {
fig <- devNew("png", pathname, label=figName, width=width, height=aspect*width);
par(mar=c(2.7,2.5,1.1,1)+0.1, tcl=-0.3, mgp=c(1.4,0.4,0), cex=2);
par(mar=c(1.7,2.5,1.1,1)+0.1);
plot(NA, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, axes=FALSE);
axis(side=1);
axis(side=2, at=c(0,2,4,6));
points(x, C, pch=".");
label <- sprintf("%s", sampleName);
stext(side=3, pos=0, label);
stext(side=3, pos=1, chrTag);
devDone();
}


