Vignette: Naive genotype calls and confidence scores
Vignette: Naive genotype calls and confidence scores
Author: Pierre Neuvial
Created: 2010-03-04
Last updated: 2011-03-07
The TumorBoost method for normalizing allele-specific copy number based on a pair of tumor-normal genotyping microarrays uses genotypes of the normal sample for normalizing allele B fractions in a tumor. Here we show how such genotypes can be obtained from allele B fractions in a normal sample. This is called "naive genotyping" because genotypes are simply called based on local minima of an estimate of the density of allele fractions in the normal sample, as described in Bengtsson, Neuvial, and Speed (2010). We also illustrate how associated genotype confidence score files can be created.
WARNING: This naive genotyper is intended for identifying heterozygous SNPs to be used by TumorBoost and/or parent-specific CN segmentation methods, which are methods that are very forgiving on genotyping errors scattered along the genome. It was not designed to be a per-SNP high-performing genotyper.
Data
totalAndFracBData/
TCGA,GBM,BeadStudio,XY/
HumanHap550/
TCGA-02-0001-01C-01D-0184-06,fracB.asb
TCGA-02-0001-01C-01D-0184-06,total.asb
**TCGA-02-0001-10A-01D-0184-06,fracB.asb**
TCGA-02-0001-10A-01D-0184-06,total.asb
TCGA,GBM,CRMAv2/
GenomeWideSNP/
TCGA-02-0001-01C-01D-0182-01,fracB.asb
TCGA-02-0001-01C-01D-0182-01,total.asb
**TCGA-02-0001-10A-01D-0182-01,fracB.asb**
TCGA-02-0001-10A-01D-0182-01,total.asb
annotationData/
chipTypes/
HumanHap550/
HumanHap550,TCGA,HB20080512.ugp
HumanHap550,TCGA,HB20100107,unitNames.txt
GenomeWideSNP_6/
GenomeWideSNP_6,Full.CDF
GenomeWideSNP_6,Full,na26,HB20080821.ugp
Note: above we are showing the same data sets as in the TumorBoost vignette, but in the present vignette only data files corresponding to allelic ratios in normal samples are used: here, "TCGA--10A-,fracB.asb", for normal blood.
Setup
library("aroma.cn")
log <- verbose <- Arguments$getVerbose(-8, timestamp=TRUE)
rootPath <- "totalAndFracBData"
rootPath <- Arguments$getReadablePath(rootPath)
dataSets <- c("TCGA,GBM,BeadStudio,XY", "TCGA,GBM,CRMAv2")
dataSet <- dataSets[1] ## Work with Illumina data
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Load the raw (tumor,normal) data set
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ds <- AromaUnitFracBCnBinarySet$byName(dataSet, chipType="*", paths=rootPath)
setFullNamesTranslator(ds, function(names, ...) {
pattern <- "^(TCGA-[0-9]{2}-[0-9]{4})-([0-9]{2}[A-Z])[-]*(.*)"
gsub(pattern, "\\1,\\2,\\3", names)
})
print(ds)
This gives
AromaUnitFracBCnBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY
Full name: TCGA,GBM,BeadStudio,XY
Number of files: 2
Names: TCGA-02-0001, TCGA-02-0001 [2]
Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550
Total file size: 4.28 MB
RAM: 0.00MB
Extract the normals
types <- sapply(ds, FUN=function(df) getTags(df)[1])
normals <- grep("(10|11)[A-Z]", types)
dsN <- ds[normals]
print(dsN)
This gives
AromaUnitFracBCnBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY
Full name: TCGA,GBM,BeadStudio,XY
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550
Total file size: 2.14 MB
RAM: 0.00MB
Naive genotype calling and associated confidence scores
fullname <- paste(c(getFullName(dsN), "NGC"), collapse=",")
chipType <- getChipType(dsN, fullname=FALSE)
outPath <- file.path("callData", fullname, chipType)
units <- NULL
if (is.null(units)) {
df <- dsN[[1]]
units <- seq(length=nbrOfUnits(df))
}
adjust <- 1.5
# Identify units on ChrX and ChrY
ugp <- getAromaUgpFile(dsN)
units23 <- getUnitsOnChromosome(ugp, 23)
is23 <- is.element(units, units23)
units24 <- getUnitsOnChromosome(ugp, 24)
is24 <- is.element(units, units24)
kk <- 1
dfN <- dsN[[kk]]
tags <- getTags(dfN)
tags <- setdiff(tags, "fracB")
tags <- c(tags, "genotypes")
fullname <- paste(c(getName(dfN), tags), collapse=",")
filename <- sprintf("%s.acf", fullname)
gcPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE)
csTags <- c(tags, "confidenceScores")
fullname <- paste(c(getName(dfN), csTags), collapse=",")
filename <- sprintf("%s.acf", fullname)
csPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE)
if (isFile(gcPathname) && isFile(csPathname)) {
next
}
betaN <- dfN[units,1,drop=TRUE]
# Call gender
gender <- callXXorXY(betaN[is23], betaN[is24], adjust=adjust, from=0, to=1)
# Call genotypes
naValue <- as.double(NA)
fit <- NULL
mu <- rep(naValue, times=length(units))
cs <- rep(naValue, times=length(units))
if (gender == "XY") {
# All but ChrX & ChrY in male
isDiploid <- (!(is23 | is24))
use <- which(isDiploid)
muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
verbose=less(verbose,10))
fit <- attr(muT, 'modelFit')
mu[use] <- muT
use <- which(!isDiploid)
muT <- callNaiveGenotypes(betaN[use], cn=1, adjust=adjust, from=0, to=1,
verbose=less(verbose,10))
mu[use] <- muT
} else {
# All but ChrY in female
isDiploid <- (!is24)
use <- which(isDiploid)
muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
verbose=less(verbose,10))
fit <- attr(muT, 'modelFit')
mu[use] <- muT
}
print(table(mu, exclude=NULL))
# Translate genotype calls in fracB space to (AA,AB,BB,...)
calls <- rep(as.character(NA), times=length(mu))
calls[mu == 0] <- "AA"
calls[mu == 1/2] <- "AB"
calls[mu == 1] <- "BB"
print(table(calls, exclude=NULL))
# Calculate confidence scores
a <- fit[[1]]$fitValleys$x[1]
b <- fit[[1]]$fitValleys$x[2]
cs[isDiploid] <- rowMins(abs(cbind(betaN[isDiploid]-a, betaN[isDiploid]-b)))
print(table(mu, exclude=NULL))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Writing genotype calls (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pathname <- gcPathname
pathnameT <- sprintf("%s.tmp", pathname)
nbrOfUnits <- nbrOfUnits(dfN)
gfN <- AromaUnitGenotypeCallFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits)
footer <- readFooter(gfN)
footer$method <- "NaiveGenotypeCaller"
writeFooter(gfN, footer)
updateGenotypes(gfN, units=units, calls=calls)
res <- file.rename(pathnameT, pathname)
if (!isFile(pathname)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}
if (isFile(pathnameT)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}
gfN <- AromaUnitGenotypeCallFile(pathname)
print(gfN)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Writing confidence scores (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pathname <- csPathname
pathnameT <- sprintf("%s.tmp", pathname)
nbrOfUnits <- nbrOfUnits(dfN)
csfN <- AromaUnitSignalBinaryFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits, types="double", size=4, signed=TRUE)
footer <- readFooter(csfN)
footer$method <- "NaiveGenotypeConfidenceScoreEstimator"
writeFooter(csfN, footer)
csfN[units, 1] <- cs
res <- file.rename(pathnameT, pathname)
if (!isFile(pathname)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}
if (isFile(pathnameT)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}
cfN <- AromaUnitSignalBinaryFile(pathname)
print(cfN)
The data sets created are set up as follows:
callData/
TCGA,GBM,BeadStudio,XY,NGC/
HumanHap550/
TCGA-02-0001,10A,01D-0184-06,genotypes,confidenceScores.acf
TCGA-02-0001,10A,01D-0184-06,genotypes.acf
They can be loaded by:
gcN <- AromaUnitGenotypeCallSet$byName(dataSet, tags="NGC", chipType="*")
print(gcN)
This gives:
AromaUnitGenotypeCallSet:Name: TCGA
Tags: GBM,BeadStudio,XY,NGC
Full name: TCGA,GBM,BeadStudio,XY,NGC
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550
Total file size: 1.07 MB
RAM: 0.00MB
csN <- AromaUnitSignalBinarySet$byName(dataSet, tags="NGC", chipType="*", pattern="confidenceScores", paths="callData")
print(csN)
This gives:
AromaUnitSignalBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY,NGC
Full name: TCGA,GBM,BeadStudio,XY,NGC
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550
Total file size: 2.14 MB
RAM: 0.00MB
References
[1] H. Bengtsson, P. Neuvial, and T. P. Speed. "TumorBoost: normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays". Eng. In: BMC bioinformatics 11 (May. 2010), p. 245. ISSN: 1471-2105. DOI: 10.1186/1471-2105-11-245. PMID: 20462408.