class Redwood::XapianIndex

This index implementation uses Xapian for searching and storage. It tends to be slightly faster than Ferret for indexing and significantly faster for searching due to precomputing thread membership.

Constants

BOOLEAN_PREFIX

Unstemmed

DATE_VALUENO
DOCID_SCALE

Xapian can very efficiently sort in ascending docid order. Sup always wants to sort by descending date, so this method maps between them. In order to handle multiple messages per second, we use a logistic curve centered around MIDDLE_DATE so that the slope (docid/s) is greatest in this time period. A docid collision is not an error - the code will pick the next smallest unused one.

EACH_ID_PAGE
INDEX_VERSION
MAX_DATE
MAX_TERM_LENGTH
MIDDLE_DATE
MIN_DATE

Dates are converted to integers for Xapian and are used for document ids, so we must ensure they're reasonably valid. This typically only affects spam.

MSGID_VALUENO
NORMAL_PREFIX

Stemmed

PREFIX
Q
STEM_LANGUAGE
THREAD_VALUENO
TIME_SCALE

Public Class Methods

new(dir=BASE_DIR) click to toggle source
# File lib/sup/xapian_index.rb, line 28
# Sets up a new index object.
#
# dir - directory handed on to the superclass initializer
#       (defaults to BASE_DIR).
#
# After the superclass has been initialised, a fresh Monitor is stored in
# @index_mutex.
def initialize(dir = BASE_DIR)
  super(dir)
  @index_mutex = Monitor.new
end

Public Instance Methods

add_message(m) click to toggle source
# File lib/sup/xapian_index.rb, line 96
# Adds message +m+ to the index by delegating to sync_message with the
# second argument set to true.
#
# Returns whatever sync_message returns.
def add_message(m)
  sync_message(m, true)
end
build_message(id) click to toggle source
# File lib/sup/xapian_index.rb, line 76
# Rebuilds a Message object from the index entry stored under +id+.
#
# id - message id looked up through get_entry (inside synchronize).
#
# Returns the Message, or nil when no entry exists for +id+.
# Raises RuntimeError ("invalid source ...") when the entry's :source_id
# is not known to SourceManager.
def build_message(id)
  record = synchronize { get_entry(id) }
  return nil unless record

  source_id = record[:source_id]
  source = SourceManager[source_id]
  raise "invalid source #{source_id}" unless source

  message = Message.new(
    :source => source,
    :source_info => record[:source_info],
    :labels => record[:labels],
    :snippet => record[:snippet]
  )

  # Each stored participant is an array that is reversed in place and
  # splatted into Person.new.
  to_person = lambda { |pair| Person.new(*pair.reverse!) }
  record[:from] = to_person.call(record[:from])
  [:to, :cc, :bcc].each { |field| record[field].map!(&to_person) }

  message.load_from_index!(record)
  message
end
contains_id?(id) click to toggle source
# File lib/sup/xapian_index.rb, line 64
# Tells whether the index has a document for message id +id+.
#
# Returns true when find_docid yields a truthy docid; otherwise the falsy
# value find_docid produced (nil or false) is returned unchanged.
def contains_id?(id)
  synchronize do
    docid = find_docid(id)
    docid && true
  end
end
delete(id) click to toggle source
# File lib/sup/xapian_index.rb, line 72
# Removes the document for message id +id+ from the Xapian database.
#
# The document is addressed by the term mkterm(:msgid, id); the call runs
# inside synchronize.
#
# Returns whatever @xapian.delete_document returns.
def delete(id)
  synchronize do
    term = mkterm(:msgid, id)
    @xapian.delete_document(term)
  end
end
each_id(query={}) { |id| ... } click to toggle source
# File lib/sup/xapian_index.rb, line 107
# Yields every message id matching +query+, fetching results in pages of
# EACH_ID_PAGE ids.
#
# query - query hash handed to build_xapian_query (defaults to {}).
#
# Paging stops after the first page that comes back with fewer than
# EACH_ID_PAGE ids. Returns nil.
def each_id(query = {})
  start = 0
  page_size = EACH_ID_PAGE
  xapian_query = build_xapian_query(query)

  # Seeded with a "full" page so the first fetch always happens.
  last_batch_size = page_size
  until last_batch_size < page_size
    batch = run_query_ids(xapian_query, start, start + page_size)
    batch.each { |id| yield id }
    last_batch_size = batch.size
    start += page_size
  end
end
each_id_by_date(query={}) { |id, lambda { build_message id }| ... } click to toggle source
# File lib/sup/xapian_index.rb, line 120
# Iterates over the ids produced by each_id(query), yielding two values per
# id: the id itself and a lambda that calls build_message(id) when invoked.
#
# query - query hash forwarded to each_id (defaults to {}).
#
# Returns whatever each_id returns.
def each_id_by_date(query = {})
  each_id(query) do |id|
    loader = lambda { build_message(id) }
    yield id, loader
  end
end
each_message_in_thread_for(m, opts={}) { |id, lambda { build_message id }| ... } click to toggle source
# File lib/sup/xapian_index.rb, line 124
# Walks the thread(s) that message +m+ belongs to and yields, for every
# message id found, the id plus a lambda that calls build_message(id).
#
# m    - object responding to #id.
# opts - options hash; when opts[:skip_killed] is truthy the walk aborts as
#        soon as thread_killed? is true for a thread id.
#
# Thread ids are read from each document's THREAD_VALUENO value (a
# comma-separated list) and followed transitively; m.id is always the first
# id yielded.
#
# Returns nil when +m+ has no document in the index, false when aborted
# because of a killed thread (nothing is yielded in that case), true
# otherwise.
def each_message_in_thread_for(m, opts = {})
  # TODO thread by subject
  root_doc = find_doc(m.id)
  return unless root_doc

  pending = root_doc.value(THREAD_VALUENO).split(',')
  ordered_ids = [m.id]
  visited_threads = Set.new
  visited_messages = Set.new [m.id]

  until pending.empty?
    thread_id = pending.pop
    next if visited_threads.include?(thread_id)
    return false if opts[:skip_killed] && thread_killed?(thread_id)
    visited_threads.add(thread_id)

    thread_docs = term_docids(mkterm(:thread, thread_id)).map { |docid| @xapian.document docid }
    thread_docs.each do |thread_doc|
      msgid = thread_doc.value(MSGID_VALUENO)
      # add? returns nil when the id was already recorded
      next unless visited_messages.add?(msgid)
      ordered_ids << msgid
      pending.concat thread_doc.value(THREAD_VALUENO).split(',')
    end
  end

  ordered_ids.each { |id| yield id, lambda { build_message id } }
  true
end
load_contacts(emails, opts={}) click to toggle source
# File lib/sup/xapian_index.rb, line 149
# Collects distinct contacts from messages matching the given participants.
#
# emails - value passed as the :participants query to each_id_by_date.
# opts   - options hash; opts[:num] caps the number of contacts (default 20).
#
# Messages are built one at a time until at least +num+ distinct
# [name, email] pairs have been gathered from their from/to/cc/bcc fields.
#
# Returns an Array of at most +num+ Person objects.
def load_contacts(emails, opts = {})
  limit = opts[:num] || 20
  found = Set.new

  each_id_by_date(:participants => emails) do |_id, builder|
    break if found.size >= limit
    message = builder.call
    people = [message.from] + message.to + message.cc + message.bcc
    people.compact.each { |person| found << [person.name, person.email] }
  end

  found.to_a.compact.map { |name, email| Person.new(name, email) }[0...limit]
end
load_index() click to toggle source
# File lib/sup/xapian_index.rb, line 34
# Opens the Xapian database stored in "<@dir>/xapian", creating it if it
# does not exist yet, and prepares the Enquire object used for searching.
#
# An existing database must carry the metadata key 'version' equal to
# INDEX_VERSION (an empty value is treated as version '0'); otherwise a
# RuntimeError explaining the required downgrade/restore procedure is
# raised. A freshly created database is stamped with INDEX_VERSION.
#
# Sets @xapian and @enquire. The Enquire object uses boolean weighting and
# ascending docid order.
def load_index
  path = File.join(@dir, 'xapian')
  # File.exist? rather than File.exists?: the latter was deprecated and
  # removed in Ruby 3.2, where it raises NoMethodError.
  if File.exist? path
    @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_OPEN)
    db_version = @xapian.get_metadata 'version'
    db_version = '0' if db_version.empty?
    if db_version != INDEX_VERSION
      fail "This Sup version expects a v#{INDEX_VERSION} index, but you have an existing v#{db_version} index. Please downgrade to your previous version and dump your labels before upgrading to this version (then run sup-sync --restore)."
    end
  else
    @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_CREATE)
    @xapian.set_metadata 'version', INDEX_VERSION
  end
  @enquire = Xapian::Enquire.new @xapian
  @enquire.weighting_scheme = Xapian::BoolWeight.new
  @enquire.docid_order = Xapian::Enquire::ASCENDING
end
num_results_for(query={}) click to toggle source
# File lib/sup/xapian_index.rb, line 100
# Estimates how many messages match +query+.
#
# query - query hash handed to build_xapian_query (defaults to {}).
#
# Runs the query through run_query with the arguments (0, 0, 100) and
# returns the resulting match set's matches_estimated value.
def num_results_for(query = {})
  matchset = run_query(build_xapian_query(query), 0, 0, 100)
  matchset.matches_estimated
end
optimize() click to toggle source
# File lib/sup/xapian_index.rb, line 57
# No-op in this index implementation: the method body is empty and it
# always returns nil.
def optimize
  nil
end
parse_query(s) click to toggle source

TODO share code with the Ferret index

# File lib/sup/xapian_index.rb, line 161
# Translates a user-typed search string into the query hash used by the
# index.
#
# The string is first offered to the "custom-search" hook, then rewritten
# in several passes before being handed to a Xapian::QueryParser:
#
#   to:/from:            expanded to <field>_email / <field>_name terms
#   is:/has:             mapped onto label: terms (is:read => -label:unread)
#   filename:/filetype:  mapped onto attachment / attachment_extension terms
#   before:/on:/in:/during:/after:
#                        turned into date:<from>..<to> ranges via Chronic
#                        (only when $have_chronic is set)
#   limit:N              removed from the string and stored in query[:limit]
#
# s - the raw search String.
#
# Returns a Hash with :qobj (the parsed Xapian query) and :text (the
# original string), plus :load_spam, :load_deleted and :limit when the
# query asks for them.
# Raises ParseError for a date Chronic cannot parse, a non-numeric limit,
# or a query that parses to nil/empty.
def parse_query(s)
  query = {}

  subs = HookManager.run("custom-search", :subs => s) || s
  subs = subs.gsub(/\b(to|from):(\S+)\b/) do
    field, value = $1, $2
    email_field, name_field = %w(email name).map { |x| "#{field}_#{x}" }
    if (p = ContactManager.contact_for(value))
      "#{email_field}:#{p.email}"
    elsif value == "me"
      '(' + AccountManager.user_emails.map { |e| "#{email_field}:#{e}" }.join(' OR ') + ')'
    else
      "(#{email_field}:#{value} OR #{name_field}:#{value})"
    end
  end

  ## if we see a label:deleted or a label:spam term anywhere in the query
  ## string, we set the extra load_spam or load_deleted options to true.
  ## bizarre? well, because the query allows arbitrary parenthesized boolean
  ## expressions, without fully parsing the query, we can't tell whether
  ## the user is explicitly directing us to search spam messages or not.
  ## e.g. if the string is -(-(-(-(-label:spam)))), does the user want to
  ## search spam messages or not?
  ##
  ## so, we rely on the fact that turning these extra options ON turns OFF
  ## the adding of "-label:deleted" or "-label:spam" terms at the very
  ## final stage of query processing. if the user wants to search spam
  ## messages, not adding that is the right thing; if he doesn't want to
  ## search spam messages, then not adding it won't have any effect.
  query[:load_spam] = true if subs =~ /\blabel:spam\b/
  query[:load_deleted] = true if subs =~ /\blabel:deleted\b/

  ## gmail style "is" operator
  subs = subs.gsub(/\b(is|has):(\S+)\b/) do
    label = $2
    case label
    when "read"
      "-label:unread"
    when "spam"
      query[:load_spam] = true
      "label:spam"
    when "deleted"
      query[:load_deleted] = true
      "label:deleted"
    else
      "label:#{label}"
    end
  end

  ## gmail style attachments "filename" and "filetype" searches
  subs = subs.gsub(/\b(filename|filetype):(\((.+?)\)\B|(\S+)\b)/) do
    field, name = $1, ($3 || $4)
    case field
    when "filename"
      debug "filename: translated #{field}:#{name} to attachment:\"#{name.downcase}\""
      "attachment:\"#{name.downcase}\""
    when "filetype"
      debug "filetype: translated #{field}:#{name} to attachment_extension:#{name.downcase}"
      "attachment_extension:#{name.downcase}"
    end
  end

  if $have_chronic
    ## binary minus binds tighter than <<, so the historical spelling
    ## "2<<32 - 1" has always evaluated to 2 << 31 (i.e. 2**32); the
    ## parentheses only make that explicit.
    lastdate = 2 << (32 - 1)
    firstdate = 0
    subs = subs.gsub(/\b(before|on|in|during|after):(\((.+?)\)\B|(\S+)\b)/) do
      field, datestr = $1, ($3 || $4)
      realdate = Chronic.parse datestr, :guess => false, :context => :past
      if realdate
        case field
        when "after"
          debug "chronic: translated #{field}:#{datestr} to #{realdate.end}"
          "date:#{realdate.end.to_i}..#{lastdate}"
        when "before"
          ## the range must stop at the START of the parsed span (as the
          ## debug line says); stopping at its end made "before:X" include
          ## X itself.
          debug "chronic: translated #{field}:#{datestr} to #{realdate.begin}"
          "date:#{firstdate}..#{realdate.begin.to_i}"
        else
          debug "chronic: translated #{field}:#{datestr} to #{realdate}"
          "date:#{realdate.begin.to_i}..#{realdate.end.to_i}"
        end
      else
        raise ParseError, "can't understand date #{datestr.inspect}"
      end
    end
  end

  ## limit:42 restrict the search to 42 results
  subs = subs.gsub(/\blimit:(\S+)\b/) do
    lim = $1
    if lim =~ /\A\d+\z/
      query[:limit] = lim.to_i
      ''
    else
      raise ParseError, "non-numeric limit #{lim.inspect}"
    end
  end

  debug "translated query: #{subs.inspect}"

  qp = Xapian::QueryParser.new
  qp.database = @xapian
  qp.stemmer = Xapian::Stem.new(STEM_LANGUAGE)
  qp.stemming_strategy = Xapian::QueryParser::STEM_SOME
  qp.default_op = Xapian::Query::OP_AND
  qp.add_valuerangeprocessor(Xapian::NumberValueRangeProcessor.new(DATE_VALUENO, 'date:', true))
  NORMAL_PREFIX.each { |k, v| qp.add_prefix k, v }
  BOOLEAN_PREFIX.each { |k, v| qp.add_boolean_prefix k, v }
  xapian_query = qp.parse_query(subs, Xapian::QueryParser::FLAG_PHRASE|Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_WILDCARD, PREFIX['body'])

  debug "parsed xapian query: #{xapian_query.description}"

  raise ParseError if xapian_query.nil? || xapian_query.empty?
  query[:qobj] = xapian_query
  query[:text] = s
  query
end
save_index() click to toggle source
# File lib/sup/xapian_index.rb, line 52
# Logs an informational notice and then flushes the Xapian database.
#
# Returns whatever @xapian.flush returns.
def save_index
  info("Flushing Xapian updates to disk. This may take a while...")
  @xapian.flush
end
size() click to toggle source
# File lib/sup/xapian_index.rb, line 60
# Returns the Xapian database's doccount, read inside synchronize.
def size
  synchronize do
    @xapian.doccount
  end
end
source_for_id(id) click to toggle source
# File lib/sup/xapian_index.rb, line 68
# Looks up the index entry for message id +id+ (inside synchronize) and
# returns its :source_id field.
#
# NOTE(review): no nil check is made on the entry, so an id for which
# get_entry returns nil ends in a NoMethodError - confirm callers only pass
# indexed ids.
def source_for_id(id)
  synchronize do
    entry = get_entry(id)
    entry[:source_id]
  end
end
update_message(m) click to toggle source
# File lib/sup/xapian_index.rb, line 97
# Updates message +m+ in the index by delegating to sync_message with the
# second argument set to true.
#
# Returns whatever sync_message returns.
def update_message(m)
  sync_message(m, true)
end
update_message_state(m) click to toggle source
# File lib/sup/xapian_index.rb, line 98
# Updates message +m+ in the index by delegating to sync_message with the
# second argument set to false.
#
# Returns whatever sync_message returns.
def update_message_state(m)
  sync_message(m, false)
end