#!/usr/bin/env python

#============================================================
# The Broad Institute
# SOFTWARE COPYRIGHT NOTICE AGREEMENT
# This software and its documentation are copyright 2012 by
# the Broad Institute/Massachusetts Institute of Technology.
# All rights are reserved.
#
# This software is supplied without any warranty or guaranteed
# support whatsoever. Neither the Broad Institute nor MIT can
# be responsible for its use, misuse, or functionality.
#
# @author: William Mallard
# @date: May 2012
#============================================================

import sys
import csv
from optparse import OptionParser

try:
    from itertools import imap
except ImportError:
    # Python 3 has no itertools.imap; the built-in map is already lazy.
    imap = map

try:
    _STRING_TYPES = (basestring,)
except NameError:
    # Python 3 has no basestring.
    _STRING_TYPES = (str,)

MAPTABLE = '/xchip/gdac_data/normalized/uuid/uuidToBarcodeMappingTable.txt'

# Module-level cache of the two lookup dicts; filled on the first mapid() call.
mapUuidToTcga = None
mapTcgaToUuid = None


class GDACOptionParser(OptionParser):
    """
    An OptionParser subclass with customized usage info.
    """

    def __init__(self, *args, **kwargs):
        OptionParser.__init__(self, *args, **kwargs)

        self.usage = "%prog [ID1 [ID2 [ID3 [...]]]]\n %prog [Options]"

        self.prolog = """%s: Sample ID translator for Broad GDAC data. Version 0.2.5

The TCGA currently employs several schemes for identifying samples, which
can make collaboration difficult. The first, and still predominant scheme
as of spring 2012, is barcodes
(https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode); the second scheme,
UUIDs, were introduced to make it possible to correct errors in sample
labeling (e.g. a tumor miscategorized as a normal) without changing the
primary key for identifying that sample within the many TCGA databases.
A third scheme, similar to the original TCGA barcodes, was introduced by
BCRs and is therefore colloquially referred to as the "BCR-friendly
barcode".

This program provides a convenient way to translate one (or many) sample
ID(s) to another sample ID format.
""" % self.get_prog_name()

    def print_help(self, file=None):
        """
        Write the prolog, then the standard optparse help text.

        `file` is the stream to write to; it defaults to sys.stdout,
        matching OptionParser.print_help.
        """
        if file is None:
            file = sys.stdout
        file.write(self.prolog + "\n")
        OptionParser.print_help(self, file)


def parseCommandLineArgs(argv=None):
    """
    Parse all command line arguments.

    `argv` is an optional list of argument strings; when it is None,
    optparse falls back to sys.argv[1:].

    Returns the (opts, args) pair produced by the parser.  If no file,
    no pipe flag and no positional IDs were given, the help text is
    printed and the process exits with status 1.
    """
    parser = GDACOptionParser()

    parser.add_option("-f", "--file",
                      action="store", dest="file", default=None,
                      help="read data from the specified file")
    parser.add_option("-p", "--pipe",
                      action="store_true", dest="pipe", default=False,
                      help="read data from stdin")

    opts, args = parser.parse_args(argv)

    # Nothing to translate: show the help and bail out.
    if not (opts.file or opts.pipe or args):
        parser.print_help()
        sys.exit(1)

    return opts, args


def loadIDs(opts, args):
    """
    Load IDs from the appropriate source.

    Precedence: the --file filename, then stdin (--pipe), then the
    positional arguments.  The chosen source is returned unread.
    """
    if opts.file:
        return opts.file
    if opts.pipe:
        return sys.stdin
    return args


def _openForCsv(filename):
    """Open `filename` in the mode the csv module expects on this Python."""
    if sys.version_info[0] < 3:
        return open(filename, 'rb')
    return open(filename, 'r', newline='')


def parseMappingTable(filename):
    """
    Parse a UUID-to-Barcode mapping table.

    The file is read as tab-separated text.  The first row is treated
    as column names and skipped; blank rows are ignored.  Every other
    row must have exactly two fields, otherwise dict() raises
    ValueError.

    Returns a (UUID-to-TCGA Barcode dict, TCGA Barcode-to-UUID dict)
    tuple.  Both are empty when the file has no data rows.
    """
    with _openForCsv(filename) as rawfile:
        tsvfile = csv.reader(rawfile, delimiter='\t')
        next(tsvfile, None)  # skip row of column names (if any)
        uuidToTcga = dict(row for row in tsvfile if row)

    tcgaToUuid = dict((v, k) for k, v in uuidToTcga.items())

    return uuidToTcga, tcgaToUuid


def _translate(idIter, uuidToTcga, tcgaToUuid):
    """Map every ID in `idIter`, picking the direction from the first ID."""
    try:
        id0 = next(idIter)
    except StopIteration:
        # No IDs at all: nothing to translate.
        return []

    # Infer the intended mapping direction
    # from the type of the first list item.
    if id0 in uuidToTcga:
        idMap = uuidToTcga
    elif id0 in tcgaToUuid:
        idMap = tcgaToUuid
    else:
        sys.stderr.write("Mapping table does not contain ID: %s\n" % id0)
        return None

    # Later IDs missing from the table come back as None.
    return [idMap[id0]] + [idMap.get(item) for item in idIter]


def mapid(idSource):
    """
    Map a list of UUIDs to TCGA Barcodes, or vice versa.

    Valid inputs:
     * A list or tuple of IDs.
     * A filename, as a string.
     * A file with one ID per line.
     * An iterable that produces IDs.

    Trailing whitespace is stripped from every ID before lookup.

    The mapping direction is inferred from the type of the first ID.
    It assumes homogeneous ID types: If the first item is a UUID,
    then it assumes that all items are UUIDs.

    Returns a list of mapped IDs (None for any later ID that is not in
    the table), an empty list when the source yields no IDs, or None
    when the source is not iterable or the first ID is unknown; in the
    last two cases a message is written to stderr.  Raises IOError if
    the mapping table or the ID file cannot be opened.
    """
    global mapUuidToTcga, mapTcgaToUuid

    # Load mapping tables (cached to global, if called multiple times)
    if mapUuidToTcga is None:
        mapUuidToTcga, mapTcgaToUuid = parseMappingTable(MAPTABLE)

    if isinstance(idSource, _STRING_TYPES):
        # For a filename: consume it inside the with-block so the
        # handle is closed before returning.
        with open(idSource, 'r') as idFile:
            return _translate(imap(str.rstrip, idFile),
                              mapUuidToTcga, mapTcgaToUuid)

    if hasattr(idSource, '__iter__'):
        # For an open file or a list/tuple/iterator of IDs:
        # lazily strip '\n' from each item.
        return _translate(imap(str.rstrip, idSource),
                          mapUuidToTcga, mapTcgaToUuid)

    sys.stderr.write("Cannot iterate over '%s'.\n" % (idSource,))
    return None


def main():
    """Translate the IDs named on the command line and print one per line."""
    opts, args = parseCommandLineArgs()
    IDs = loadIDs(opts, args)

    try:
        mappedIDs = mapid(IDs)
    except IOError as ex:
        sys.stderr.write("%s\n" % ex)
        sys.exit(1)

    if mappedIDs:
        for ID in mappedIDs:
            print(ID)


if __name__ == "__main__":
    main()