How to: Create a Unit Genome Position (UGP) file
Aroma UGP files
Aroma UGP files are binary files storing a (unit, chromosome, position) map in a tabular format. The unit indices are implicit, that is, they are not stored in the file but instead the rows are assumed to be in the order of the corresponding unit names file, which if you work with Affymetrix data is an Affymetrix CDF file.
Examples
Below are four different examples of how one can create a UGP file from scratch. They all have in common that they allocate the UGP file using a CDF as a template. They differ in how they populate the UGP file.
Example #1 - Manually assign values
# The CDF serves as the template for the UGP file
cdf <- AffymetrixCdfFile$byChipType("GenomeWideSNP_6")
# Allocate an empty UGP file for this CDF, unless one already exists.
ugp <- AromaUgpFile$allocateFromCdf(cdf, tags="HB20081121")
print(ugp)
## AromaUgpFile:
## Name: GenomeWideSNP_6
## Tags: HB20081121
## Pathname:
## annotationData/chipTypes/GenomeWideSNP_6/GenomeWideSNP_6,HB20081121.ugp
## File size: 8.85MB
## RAM: 0.00MB
## Number of data rows: 1856069
## File format: v1
## Dimensions: 1856069x2
## Column classes: integer, integer
## Number of bytes per column: 1, 4
## Footer: <createdOn>20081121 09:32:19
## PST</createdOn><platform>Affymetrix</platform>
## <chipType>GenomeWideSNP_6</chipType>
## Chip type: GenomeWideSNP_6
## Platform: Affymetrix
# CDF unit names whose genomic locations are known from other sources
knownUnitNames <- c("SNP_A-4214434", "SNP_A-2005029", "SNP_A-2005128", "CN_065226", "CN_568310")
# Map the unit names to unit indices (as defined by the CDF)
units <- indexOf(cdf, names=knownUnitNames)
print(units)
## [1] 1081 1082 1083 1506902 1506903
# Nothing has been assigned yet, so all entries are NA
print(ugp[units,])
## chromosome position
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
# Column 1 holds the chromosome of each unit
chromosomes <- c(1, 1, 1, 11, 11)
ugp[units,1] <- chromosomes
print(ugp[units,])
## chromosome position
## 1 1 NA
## 2 1 NA
## 3 1 NA
## 4 11 NA
## 5 11 NA
# Column 2 holds the position of each unit on its chromosome
positions <- c(38692010, 38831590, 38969651, 64488763, 64509535)
ugp[units,2] <- positions
# The UGP file now holds both chromosome and position for these units
print(ugp[units,])
## chromosome position
## 1 1 38692010
## 2 1 38831590
## 3 1 38969651
## 4 11 64488763
## 5 11 64509535
Example #2 - Import from NetAffx files
# Define the chip type once; it is needed both for the CDF and for
# locating the NetAffx CSV annotation files below.
chipType <- "GenomeWideSNP_6"
cdf <- AffymetrixCdfFile$byChipType(chipType)
# Creates an empty UGP file for the CDF, if missing.
ugp <- AromaUgpFile$allocateFromCdf(cdf, tags=c("na23", "HB20081121"))
# Import NetAffx unit position data
csv <- AffymetrixNetAffxCsvFile$byChipType(chipType, tags=".na23")
units <- importFrom(ugp, csv)
str(units)
## int [1:934968] 334945 334944 334942 334941 334940 334939 334910 334937 ...
# Import NetAffx CN probe position data
csv <- AffymetrixNetAffxCsvFile$byChipType(chipType, tags=".cn.na23")
units <- importFrom(ugp, csv)
str(units)
## int [1:945826] 935622 935777 935671 935631 935625 935703 935698 935705 ...
# Available chromosomes
table(ugp[,1])
## 1 2 3 4 5 6 7 8 9 10
## 146524 153732 127815 120360 115731 112895 101093 98306 82225 93655
## 11 12 13 14 15 16 17 18 19 20
## 89615 87372 66106 57121 53595 54215 46678 52109 30362 43648
## 21 22 23 24
## 25129 24513 87204 9486
# Get units on chromosome X
units <- getUnitsAt(ugp, 23)
str(units)
## int [1:87204] 61101 61102 61103 61104 61105 61106 61107 61108 ...
Example #3 - Import from ASCII tabular files
You can import UGP data from any valid ASCII tabular file given that it contains at least three columns:
- Affymetrix unit names,
- chromosome, and
- chromosome positions.
It is easier if they are in that order, but not required.
# The CDF serves as the template for the UGP file
cdf <- AffymetrixCdfFile$byChipType("GenomeWideSNP_6")
# Allocate an empty UGP file for this CDF, unless one already exists.
ugp <- AromaUgpFile$allocateFromCdf(cdf, tags=c("CSV", "HB20081121"))
# Set up the ASCII tabular file holding the annotation data
txtPath <- "annotationData/chipTypes/GenomeWideSNP_6/myFiles/"
txtName <- "GenomeWideSNP_6,my-own-tabular-ASCII-file.txt"
dat <- TabularTextFile(txtName, path=txtPath)
print(dat)
## TabularTextFile:
## Name: GenomeWideSNP_6
## Tags: my-own-tabular-ASCII-file
## Pathname: annotationData/chipTypes/GenomeWideSNP_6/myFiles/GenomeWideSNP_6,my-own-tabular-ASCII-file.txt
## File size: 1.51kB
## Columns [5]: 'Probe Set ID', 'Physical Position', 'Chromosome',
## 'Strand', 'dbSNP RS ID'
# Select the columns to import by matching regular expressions against
# the column names; each pattern is paired with the column class to read.
colClassPatterns <- c(
  "^Probe Set ID$"="character",
  "^Chromosome$"="character",
  "^Physical Position$"="integer"
)
# Note, the reordering of the imported columns.
units <- importFrom(ugp, dat, colClassPatterns=colClassPatterns, colOrder=c(1,3,2))
str(units)
## int [1:39] 622 623 624 625 626 627 628 629 630 631 ...
print(ugp[units[1:10],])
## chromosome position
## 622 1 1145994
## 623 1 2224111
## 624 1 2319424
## 625 1 2543484
## 626 1 2926730
## 627 1 2941694
## 628 1 3084986
## 629 1 3155127
## 630 1 3292731
## 631 1 3695086
Example #4 - Import from dChip genome information files
# Define the chip type once; it is needed both for the CDF and for
# locating the dChip genome information file below.
chipType <- "Mapping250K_Nsp"
cdf <- AffymetrixCdfFile$byChipType(chipType)
# Create an empty UGP file for this chip type
ugp <- AromaUgpFile$allocateFromCdf(cdf, tags=c("dChip", "HB20081121"))
# Import data from the dChip genome information file
gi <- DChipGenomeInformation$byChipType(chipType)
units <- importFrom(ugp, gi)
# Get the chromosomes and physical positions for units 100:102
print(ugp[100:102,])
## 1 2
## 100 10 55075252
## 101 22 20865611
## 102 8 15317330
# Get the chromosomes and physical positions for three SNPs
snps <- c("SNP_A-1782949", "SNP_A-4192675", "SNP_A-1783398")
print(ugp[snps,])
## chromosome position
## 100 10 55075252
## 101 22 20865611
## 102 8 15317330
# Select all units on chromosome X @ 12.10-12.15Mb
# NOTE(review): the positions in the output below are around 1.21Mb, not
# 12.1Mb - confirm whether the window or the listed output is the intended one.
# Keep the result so that the selected units can be extracted afterwards.
res <- subset(ugp, chromosome == 23 & 12.10e6 <= position & position <= 12.15e6)
print(res)
## chromosome position
## 5294 23 1213762
## 6032 23 1211686
## 9373 23 1212709
## 11935 23 1213754
## 12794 23 1211867
## 13029 23 1211855
# The units in this region; the row names of the subset are the unit indices
units <- as.integer(rownames(res))
units
## [1] 5294 6032 9373 11935 12794 13029