Changeset 6541


Ignore:
Timestamp:
Jan 17, 2022, 8:47:47 AM (21 months ago)
Author:
Nicklas Nordborg
Message:

References #1354: Search functionality for the OncoArray-500K SNP chip

Major refactoring of the indexing functionality. The OncoArray data is now indexed, but differently from the "normal" indexing since we can't index each variant separately. For each raw bioassay we create three "documents": one each for gt:0/0, gt:0/1 and gt:1/1, each with a field snps that is a list of SNP-ID values. We also create a fourth document with some summary information, which is identical to the normal indexing.

Some functions for getting metadata (counts, etc.) have been moved to the subclasses since the queries that are needed are different. There are still some functions that need to be fixed.

Location:
extensions/net.sf.basedb.varsearch/trunk
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • extensions/net.sf.basedb.varsearch/trunk/resources/admin/manager.jsp

    r6167 r6541  
    2626{
    2727  display: grid;
    28   grid-template-columns: 2fr 1fr 1fr;
     28  grid-template-columns: 2fr 2fr 1fr;
    2929  width: 60em;
    3030  border-width: 1px;
  • extensions/net.sf.basedb.varsearch/trunk/src/net/sf/basedb/varsearch/LuceneColumnFactory.java

    r6530 r6541  
    233233      }
    234234     
    235       AllDocsCollector hits = getHits(item.getId(), maxHitsPerRba, new SortByChromAndPos());
     235      // TODO -- sorting doesn't work with the OncoArray index
     236      AllDocsCollector hits = getHits(item.getId(), maxHitsPerRba, null); //new SortByChromAndPos());
    236237     
    237238      NullSafeStringBuilder sb = new NullSafeStringBuilder();
     
    251252        if (indexAllGenotypes)
    252253        {
    253           if (gt == null && "./.".equals(gt))
     254          if (gt == null || "./.".equals(gt))
    254255          {
    255256            cls += " no-data";
  • extensions/net.sf.basedb.varsearch/trunk/src/net/sf/basedb/varsearch/index/LuceneIndex.java

    r6531 r6541  
    66import java.util.Collection;
    77import java.util.Collections;
    8 import java.util.HashMap;
    98import java.util.List;
    10 import java.util.Map;
    119import java.util.Set;
    1210import java.util.concurrent.CountDownLatch;
     
    2018
    2119import org.apache.lucene.analysis.Analyzer;
    22 import org.apache.lucene.analysis.core.KeywordAnalyzer;
    23 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
     20import org.apache.lucene.document.Document;
    2421import org.apache.lucene.document.IntPoint;
    2522import org.apache.lucene.document.LongPoint;
     
    5350import net.sf.basedb.core.query.IdListRestriction;
    5451import net.sf.basedb.util.FileUtil;
    55 import net.sf.basedb.varsearch.analyze.AlphaNumericIgnoreCaseAnalyzer;
    56 import net.sf.basedb.varsearch.analyze.EffectAnalyzer;
    57 import net.sf.basedb.varsearch.analyze.HgvsCdnaAnalyzer;
    58 import net.sf.basedb.varsearch.analyze.HgvsProtAnalyzer;
     52import net.sf.basedb.util.Values;
    5953import net.sf.basedb.varsearch.dao.Itemlist;
    60 import net.sf.basedb.varsearch.query.FieldAwareQueryParser;
    61 import net.sf.basedb.varsearch.query.FloatQueryField;
    62 import net.sf.basedb.varsearch.query.IntQueryField;
    63 import net.sf.basedb.varsearch.query.LongQueryField;
     54import net.sf.basedb.varsearch.query.AllDocsCollector;
    6455import net.sf.basedb.varsearch.query.QueryCache;
    65 import net.sf.basedb.varsearch.query.QueryField;
    6656import net.sf.basedb.varsearch.query.RawBioAssayIdCollector;
    67 import net.sf.basedb.varsearch.query.StripWildcardQueryField;
    6857import net.sf.basedb.varsearch.service.VarSearchService;
    6958
    7059/**
    71   Represents a Lucene index database.
     60  Represents a Lucene index database. Subclasses are required to implement
     61  some methods.
     62 
    7263  @author nicklas
    7364*/
    74 public class LuceneIndex
     65public abstract class LuceneIndex
    7566  implements Closeable
    7667{
     
    10192  private ExecutorService executor;
    10293  private Analyzer analyzer;
    103   private Map<String, QueryField> queryFields;
    10494  private QueryCache cache;
    10595 
     
    118108  }
    119109 
     110  /**
     111    Get the ID of the index.
     112  */
    120113  public String getId()
    121114  {
     
    247240    this.path = getExistingOrNewPath(pathPrefix);
    248241    this.directory = createIndexIfNeeded(path);
    249    
    250     // Create the Analyzer. Most fields are indexed literally with the
    251     // default KeywordAnalyzer.
    252     Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
    253     fieldAnalyzers.put("gene", new AlphaNumericIgnoreCaseAnalyzer()); // Gene names can be a list and we ignore case
    254     fieldAnalyzers.put("c", new HgvsCdnaAnalyzer()); // HGVS.c analyzer
    255     fieldAnalyzers.put("p", new HgvsProtAnalyzer()); // HGVS.p analyzer
    256     fieldAnalyzers.put("effect", new EffectAnalyzer()); // ANN.Annotation (=Effect) analyzer
    257     this.analyzer = new PerFieldAnalyzerWrapper(new KeywordAnalyzer(), fieldAnalyzers);
    258    
    259     queryFields = new HashMap<>();
    260     queryFields.put("pos", LongQueryField.INSTANCE);
    261     for (int i = 1; i < 23; i++)
    262     {
    263       queryFields.put("chr"+i, LongQueryField.INSTANCE);
    264     }
    265     queryFields.put("chrX", LongQueryField.INSTANCE);
    266     queryFields.put("chrY", LongQueryField.INSTANCE);
    267     queryFields.put("c", StripWildcardQueryField.INSTANCE);
    268     queryFields.put("p", StripWildcardQueryField.INSTANCE);
    269     queryFields.put("dp", IntQueryField.INSTANCE);
    270     queryFields.put("vd", IntQueryField.INSTANCE);
    271     queryFields.put("af", FloatQueryField.INSTANCE);
    272     queryFields.put("rbaId", IntQueryField.INSTANCE);
    273     queryFields.put("file", IntQueryField.INSTANCE);
    274    
    275     this.cache = new QueryCache(60); // 1 hour
    276     this.reader = DirectoryReader.open(directory);
    277     this.searcher = new IndexSearcher(reader, executor);
     242    this.reader = createIndexReader(directory);
     243    this.searcher = createIndexSearcher(reader);
     244    this.analyzer = createAnalyzer();
     245    this.cache = createQueryCache();
    278246    this.status = Status.IDLE;
    279247    this.autoUpdateAction = AutoUpdate.DEFAULT;
    280248  }
    281249 
     250  /**
     251    Create a reader for reading documents and information from
     252    the given index directory.
     253  */
     254  protected IndexReader createIndexReader(Directory directory)
     255    throws IOException
     256  {
     257    return DirectoryReader.open(directory);
     258  }
     259 
     260  /**
     261    Create a searcher for executing queries against the given index.
     262  */
     263  protected IndexSearcher createIndexSearcher(IndexReader reader)
     264  {
     265     return new IndexSearcher(reader, executor);
     266  }
     267
     268  /**
     269    Create an Analyzer implementation that is used for analyzing
     270    text that is going into the index. The analyzer is also used for
     271    parsing query strings.
     272  */
     273  protected abstract Analyzer createAnalyzer();
     274 
     275  /**
     276    Create a cache for storing results from queries that takes
     277    a long time to execute.
     278  */
     279  protected QueryCache createQueryCache()
     280  {
     281    // TODO -- add support for implementations that return null
     282    // and implement a setting for what is considered a long time
     283    return new QueryCache(60); // 1 hour
     284  }
     285
    282286  @Override
    283287  public void close()
     
    443447      config.setMergePolicy(mergePolicy);
    444448     
    445       writer = new IndexWriter(directory, config);
    446      
     449      writer = new IndexWriter(directory, config);     
    447450      processed = addToIndex(dc, rawBioAssays, writer);
    448451
     
    454457      logger.debug("Commit completed ("+name+")");
    455458      progress.display(99, "Commit complete. Cleaning up...");
     459     
    456460      rwLock.writeLock().lock();
    457461      try
    458462      {
    459         cache = new QueryCache(60);
    460         reader = DirectoryReader.open(directory);
    461         searcher = new IndexSearcher(reader);
     463        cache = createQueryCache();
     464        reader = createIndexReader(directory);
     465        searcher = createIndexSearcher(reader);
    462466      }
    463467      finally
     
    535539      ThreadFactory threadFactory = new IndexThreadFactory(id);
    536540      threadPool = Executors.newFixedThreadPool(numThreads, threadFactory);
    537       ExecutorCompletionService<RawBioAssayIndexer> executor = new ExecutorCompletionService<>(threadPool);
     541      ExecutorCompletionService<Indexer> executor = new ExecutorCompletionService<>(threadPool);
    538542 
    539543      numRba = 0;
    540       int allRba = rawBioAssays.size();
     544      int allRba = rawBioAssays.size();     
    541545      for (RawBioAssay rba : rawBioAssays)
    542546      {
     
    545549        {
    546550          numRba++;
    547           executor.submit(new RawBioAssayIndexer(this, writer, numRba, rba, vcfFiles));
     551          executor.submit(createIndexer(writer, numRba, rba, vcfFiles));
    548552          if (numRba % 100 == 0)
    549553          {
     
    564568      int numFailed = 0;
    565569      int numAborted = 0;
    566  
     570
    567571      for (int i = 0; i < numRba; i++)
    568572      {
    569         Future<RawBioAssayIndexer> result = executor.take();
     573        Future<Indexer> result = executor.take();
    570574        dc.getSessionControl().updateLastAccess(); // To avoid session timeout
    571575        if (result != null && !result.isCancelled())
     
    573577          try
    574578          {
    575             RawBioAssayIndexer indexer = result.get();
     579            Indexer indexer = result.get();
    576580            if (!indexer.wasAborted())
    577581            {
     
    607611 
    608612  /**
     613    Create an Indexer implementation that knows how to index the information
     614    in the given raw bioassay.
     615  */
     616  protected abstract Indexer createIndexer(IndexWriter writer, int num, RawBioAssay rba, List<VcfFile> vcfFiles);
     617 
     618  /**
    609619    Remove RawBioassays from the index.
    610620  */
     
    629639      {
    630640        // delete existing information about this raw bioassay id
     641        // TODO -- maybe this should also be in the subclasses
    631642        writer.deleteDocuments(IntPoint.newExactQuery("rbaId", rbaId));
    632643        writer.deleteDocuments(IntPoint.newExactQuery("mainId", rbaId));
     
    649660        try
    650661        {
    651           cache = new QueryCache(60);
    652           reader = DirectoryReader.open(directory);
    653           searcher = new IndexSearcher(reader);
     662          reader = createIndexReader(directory);
     663          searcher = createIndexSearcher(reader);
     664          cache = createQueryCache();
    654665        }
    655666        finally
     
    714725        path = newPath;
    715726        directory = new NIOFSDirectory(path.toPath());
    716         cache = new QueryCache(60);
    717         reader = DirectoryReader.open(directory);
    718         searcher = new IndexSearcher(reader);
     727        reader = createIndexReader(directory);
     728        searcher = createIndexSearcher(reader);
     729        cache = createQueryCache();
    719730      }
    720731      finally
     
    831842        path = rebuildPath;
    832843        directory = new NIOFSDirectory(path.toPath());
    833         cache = new QueryCache(60);
    834         reader = DirectoryReader.open(directory);
    835         searcher = new IndexSearcher(reader);
     844        reader = createIndexReader(directory);
     845        searcher = createIndexSearcher(reader);
     846        cache = createQueryCache();
    836847      }
    837848      finally
     
    912923  }
    913924 
    914   private Integer countGenotypes(RawBioAssay rba, boolean allGenotypes)
     925  protected Integer countGenotypes(RawBioAssay rba, boolean allGenotypes)
    915926    throws IOException
    916927  {
    917928    if (status == Status.DISABLED) return null;
    918929
    919     Query query = IntPoint.newExactQuery("mainId", rba.getId());
    920     if (getIndexSearcher().count(query) == 0) return null;
    921    
    922     query = IntPoint.newExactQuery("rbaId", rba.getId());
    923     if (indexAllGenotypes && !allGenotypes)
    924     {
    925       // We need to exclude 0/0 genotype from the result
    926       BooleanQuery.Builder b = new BooleanQuery.Builder();
    927       b.add(query, Occur.MUST);
    928       b.add(new TermQuery(new Term("gt", "0/0")), Occur.MUST_NOT);
    929       query = b.build();
    930     }
    931     return getIndexSearcher().count(query);
    932   }
     930    Query query = IntPoint.newExactQuery("mainId", rba.getId());   
     931   
     932    AllDocsCollector hits = new AllDocsCollector(getIndexReader(), 1, null);
     933    getIndexSearcher().search(query, hits);
     934    if (hits.getTotalHits() == 0) return null;
     935   
     936    Document doc = hits.getDocuments().iterator().next();
     937    return Values.getInt(doc.get(allGenotypes ? "numGenotypes" : "numVariants"));
     938  }
     939
    933940 
    934941  /**
     
    990997    Create a new parser for creating queries from strings.
    991998  */
    992   public QueryParser createQueryParser()
    993   {
    994     return new FieldAwareQueryParser("gene", analyzer, queryFields);
    995   }
     999  public abstract QueryParser createQueryParser();
    9961000
    9971001  /**
     
    10071011    @return The number of variants or -1 if the index is not open
    10081012  */
    1009   public int getNumVariants()
    1010   {
    1011     if (status == Status.DISABLED) return -1;
    1012     return reader.numDocs();
    1013   }
     1013  public abstract long getNumVariants()
     1014    throws IOException;
    10141015 
    10151016  /**
     
    10401041  /**
    10411042    Get the id of all raw bioassays where the specified variant has been found.
     1043    TODO -- this need to be subclassed
    10421044  */
    10431045  public Set<Integer> getRawBioAssaysWithVariant(String chrom, long pos, String ref, String alt)
  • extensions/net.sf.basedb.varsearch/trunk/src/net/sf/basedb/varsearch/query/LuceneQueryFactory.java

    r6524 r6541  
    244244          queries.add(query);
    245245        }
    246        
     246
    247247        logger.debug("Index: " + idx.getName());
    248248        logger.debug("QueryString: " + queryString);
  • extensions/net.sf.basedb.varsearch/trunk/src/net/sf/basedb/varsearch/service/VarSearchService.java

    r6525 r6541  
    3535import net.sf.basedb.varsearch.index.DatafileVcfLocator;
    3636import net.sf.basedb.varsearch.index.LuceneIndex;
     37import net.sf.basedb.varsearch.index.VariantCallIndex;
    3738import net.sf.basedb.varsearch.index.LuceneIndex.AutoUpdate;
    3839import net.sf.basedb.varsearch.index.LuceneIndex.FullRebuildRunnable;
    3940import net.sf.basedb.varsearch.index.LuceneIndex.QueryThreadFactory;
    4041import net.sf.basedb.varsearch.index.LuceneIndex.Status;
     42import net.sf.basedb.varsearch.index.OncoArrayIndex;
    4143
    4244public class VarSearchService
     
    157159      try
    158160      {
    159         LuceneIndex filtered = new LuceneIndex("filtered");
     161        LuceneIndex filtered = new VariantCallIndex("filtered");
    160162        filtered.setName("Variants (filtered)");
    161163        filtered.setItemList(Itemlist.VARIANT_INDEX_FILTERED);
     
    165167        indexes.put(filtered.getId(), filtered);
    166168       
    167         LuceneIndex raw = new LuceneIndex("raw");
     169        LuceneIndex raw = new VariantCallIndex("raw");
    168170        raw.setName("Variants (all)");
    169171        raw.setItemList(Itemlist.VARIANT_INDEX_ALL);
     
    174176        indexes.put(raw.getId(), raw);
    175177       
    176         LuceneIndex targeted = new LuceneIndex("targeted");
     178        LuceneIndex targeted = new VariantCallIndex("targeted");
    177179        targeted.setName("Variants (targeted)");
    178180        targeted.setItemList(Itemlist.VARIANT_INDEX_TARGETED);
     
    184186        targeted.open(new File(dbDir, "targeted"), null);
    185187        indexes.put(targeted.getId(), targeted);
    186 
     188       
     189        LuceneIndex oncoArray500K = new OncoArrayIndex("oncoarray-500K");
     190        oncoArray500K.setName("Genotyping (OncoArray500K)");
     191        oncoArray500K.setItemList(Itemlist.GENOTYPING_ONCOARRAY_500K);
     192        oncoArray500K.setIndexAllGenotypes(true);
     193        oncoArray500K.setViewAllVariantsEnabled(false);
     194        oncoArray500K.setVcfFileLocator(new DatafileVcfLocator(Datafiletype.VCF, false));
     195        oncoArray500K.open(new File(dbDir, "oncoarray-500k"), null);
     196        indexes.put(oncoArray500K.getId(), oncoArray500K);
    187197      }
    188198      catch (IOException ex)
Note: See TracChangeset for help on using the changeset viewer.