| 1 | package org.intermine.bio.dataconversion; |
|---|
| 2 | |
|---|
| 3 | /* |
|---|
| 4 | * Copyright (C) 2002-2009 FlyMine |
|---|
| 5 | * |
|---|
| 6 | * This code may be freely distributed and modified under the |
|---|
| 7 | * terms of the GNU Lesser General Public Licence. This should |
|---|
| 8 | * be distributed with the code. See the LICENSE file for more |
|---|
| 9 | * information or http://www.gnu.org/copyleft/lesser.html. |
|---|
| 10 | * |
|---|
| 11 | */ |
|---|
| 12 | |
|---|
| 13 | import java.io.File; |
|---|
| 14 | import java.io.IOException; |
|---|
| 15 | import java.io.Reader; |
|---|
| 16 | import java.util.HashMap; |
|---|
| 17 | import java.util.Iterator; |
|---|
| 18 | |
|---|
| 19 | import org.apache.commons.lang.StringUtils; |
|---|
| 20 | import org.apache.log4j.Logger; |
|---|
| 21 | import org.intermine.dataconversion.ItemWriter; |
|---|
| 22 | import org.intermine.metadata.Model; |
|---|
| 23 | import org.intermine.objectstore.ObjectStoreException; |
|---|
| 24 | import org.intermine.util.FormattedTextParser; |
|---|
| 25 | import org.intermine.xml.full.Item; |
|---|
| 26 | |
|---|
| 27 | |
|---|
| 28 | /** |
|---|
| 29 | * DataConverter to load Kegg Pathways and link them to Genes |
|---|
| 30 | * |
|---|
| 31 | * @author Richard Smith |
|---|
| 32 | */ |
|---|
| 33 | public class KeggExampleConverter extends BioFileConverter |
|---|
| 34 | { |
|---|
| 35 | protected static final Logger LOG = Logger.getLogger(KeggExampleConverter.class); |
|---|
| 36 | |
|---|
| 37 | protected HashMap<String, Item> pathwayMap = new HashMap<String, Item>(); |
|---|
| 38 | private String taxonId = null; |
|---|
| 39 | private Item organism = null; |
|---|
| 40 | |
|---|
| 41 | /** |
|---|
| 42 | * Constructor |
|---|
| 43 | * @param writer the ItemWriter used to handle the resultant items |
|---|
| 44 | * @param model the Model |
|---|
| 45 | */ |
|---|
| 46 | public KeggExampleConverter(ItemWriter writer, Model model) { |
|---|
| 47 | super(writer, model, "GenomeNet", "KEGG PATHWAY"); |
|---|
| 48 | } |
|---|
| 49 | |
|---|
| 50 | /** |
|---|
| 51 | * Set the taxon id to process. |
|---|
| 52 | * @param taxonId the id |
|---|
| 53 | */ |
|---|
| 54 | public void setTaxonId(String taxonId) { |
|---|
| 55 | this.taxonId = taxonId; |
|---|
| 56 | } |
|---|
| 57 | |
|---|
| 58 | /** |
|---|
| 59 | * Called for each file found by ant. |
|---|
| 60 | * |
|---|
| 61 | * {@inheritDoc} |
|---|
| 62 | */ |
|---|
| 63 | public void process(Reader reader) throws Exception { |
|---|
| 64 | if (StringUtils.isEmpty(taxonId)) { |
|---|
| 65 | throw new IllegalArgumentException("No taxonId provided: " + taxonId); |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | // There are two files: |
|---|
| 69 | // map_title.tab - pathway ids and their names |
|---|
| 70 | // xxx_gene_map.tab - genes and the pathways they are involved in |
|---|
| 71 | // The following code works out which file we are reading and calls the corresponding method |
|---|
| 72 | File currentFile = getCurrentFile(); |
|---|
| 73 | |
|---|
| 74 | if (currentFile.getName().equals("map_title.tab")) { |
|---|
| 75 | processMapTitleFile(reader); |
|---|
| 76 | } else if (currentFile.getName().endsWith("gene_map.tab")) { |
|---|
| 77 | processGeneMapFile(reader); |
|---|
| 78 | } else { |
|---|
| 79 | throw new IllegalArgumentException("Unexpected file: " + currentFile.getName()); |
|---|
| 80 | } |
|---|
| 81 | } |
|---|
| 82 | |
|---|
| 83 | |
|---|
| 84 | /** |
|---|
| 85 | * Process all rows of the map_title.tab file |
|---|
| 86 | * @param reader a reader for the map_title.tab file |
|---|
| 87 | * @throws IOException |
|---|
| 88 | * @throws ObjectStoreException |
|---|
| 89 | */ |
|---|
| 90 | private void processMapTitleFile(Reader reader) throws IOException, ObjectStoreException { |
|---|
| 91 | Iterator lineIter = FormattedTextParser.parseTabDelimitedReader(reader); |
|---|
| 92 | |
|---|
| 93 | // this file has data of the format: |
|---|
| 94 | // pathway id | pathway name |
|---|
| 95 | while (lineIter.hasNext()) { |
|---|
| 96 | // line is a string array with the one element for each tab separated value |
|---|
| 97 | // on the next line of the file |
|---|
| 98 | String[] line = (String[]) lineIter.next(); |
|---|
| 99 | |
|---|
| 100 | String pathwayId = line [0]; |
|---|
| 101 | String pathwayName = line[1]; |
|---|
| 102 | |
|---|
| 103 | // getPathway will create an Item or fetch it from a map if seen before |
|---|
| 104 | Item pathway = getPathway(pathwayId); |
|---|
| 105 | pathway.setAttribute("name", pathwayName); |
|---|
| 106 | |
|---|
| 107 | // once we have set the pathway name that is all the information needed so we can store |
|---|
| 108 | store(pathway); |
|---|
| 109 | } |
|---|
| 110 | } |
|---|
| 111 | |
|---|
| 112 | /** |
|---|
| 113 | * Process all rows of the xxx_gene_map.tab file |
|---|
| 114 | * @param reader a reader for the xxx_gene_map.tab file |
|---|
| 115 | * @throws IOException |
|---|
| 116 | * @throws ObjectStoreException |
|---|
| 117 | */ |
|---|
| 118 | private void processGeneMapFile(Reader reader) throws IOException, ObjectStoreException { |
|---|
| 119 | // this file has data of the format: |
|---|
| 120 | // gene id | pathway ids (space separated) |
|---|
| 121 | |
|---|
| 122 | Iterator lineIter = FormattedTextParser.parseTabDelimitedReader(reader); |
|---|
| 123 | |
|---|
| 124 | while (lineIter.hasNext()) { |
|---|
| 125 | // line is a string array with the one element for each tab separated value |
|---|
| 126 | // on the next line of the file |
|---|
| 127 | String[] line = (String[]) lineIter.next(); |
|---|
| 128 | |
|---|
| 129 | String geneId = line[0]; |
|---|
| 130 | |
|---|
| 131 | // create a gene with this id as primaryIdentifier |
|---|
| 132 | Item gene = createItem("Gene"); |
|---|
| 133 | gene.setAttribute("primaryIdentifier", geneId); |
|---|
| 134 | gene.setReference("organism", getOrganism()); |
|---|
| 135 | |
|---|
| 136 | // split the space separated list of pathway ids |
|---|
| 137 | String[] pathwayIds = line[1].split(" "); |
|---|
| 138 | |
|---|
| 139 | // add each pathway to the Gene.pathways collection |
|---|
| 140 | for (String pathwayId : pathwayIds) { |
|---|
| 141 | // getPathway() will create a new pathway or fetch it from a map if already seen |
|---|
| 142 | Item pathway = getPathway(pathwayId); |
|---|
| 143 | gene.addToCollection("pathways", pathway); |
|---|
| 144 | } |
|---|
| 145 | |
|---|
| 146 | // we have finished with this gene now so can store it |
|---|
| 147 | store(gene); |
|---|
| 148 | } |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | /** |
|---|
| 152 | * Create a new pathway Item or fetch from a map if it has been seen before |
|---|
| 153 | * @param pathwayId the id of a KEGG pathway to look up |
|---|
| 154 | * @return an Item representing the pathway |
|---|
| 155 | */ |
|---|
| 156 | private Item getPathway(String pathwayId) { |
|---|
| 157 | Item pathway = pathwayMap.get(pathwayId); |
|---|
| 158 | if (pathway == null) { |
|---|
| 159 | pathway = createItem("Pathway"); |
|---|
| 160 | pathway.setAttribute("identifier", pathwayId); |
|---|
| 161 | pathwayMap.put(pathwayId, pathway); |
|---|
| 162 | } |
|---|
| 163 | return pathway; |
|---|
| 164 | } |
|---|
| 165 | |
|---|
| 166 | /** |
|---|
| 167 | * Get an Item representing an organism, create and store it if called for the first time |
|---|
| 168 | * @return an Item representing the organism |
|---|
| 169 | * @throws ObjectStoreException |
|---|
| 170 | */ |
|---|
| 171 | private Item getOrganism() throws ObjectStoreException { |
|---|
| 172 | if (organism == null) { |
|---|
| 173 | organism = createItem("Organism"); |
|---|
| 174 | organism.setAttribute("taxonId", taxonId); |
|---|
| 175 | store(organism); |
|---|
| 176 | } |
|---|
| 177 | return organism; |
|---|
| 178 | } |
|---|
| 179 | } |
|---|