# -*- coding: utf-8 -*-
# Copyright 2015, 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from twisted.internet import defer

from .background_updates import BackgroundUpdateStore
from synapse.api.errors import SynapseError
from synapse.storage.engines import PostgresEngine, Sqlite3Engine

import logging
import re
import ujson as json

logger = logging.getLogger(__name__)


class SearchStore(BackgroundUpdateStore):

    EVENT_SEARCH_UPDATE_NAME = "event_search"
    EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order"
    EVENT_SEARCH_USE_GIST_POSTGRES_NAME = "event_search_postgres_gist"

    def __init__(self, hs):
        super(SearchStore, self).__init__(hs)
        self.register_background_update_handler(
            self.EVENT_SEARCH_UPDATE_NAME, self._background_reindex_search
        )
        self.register_background_update_handler(
            self.EVENT_SEARCH_ORDER_UPDATE_NAME,
            self._background_reindex_search_order
        )
        self.register_background_update_handler(
            self.EVENT_SEARCH_USE_GIST_POSTGRES_NAME,
            self._background_reindex_gist_search
        )
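
    # The handlers registered above are invoked repeatedly by the background
    # updater with (progress, batch_size) and return the number of rows
    # processed; each one signals completion by calling
    # self._end_background_update() with its update name.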

    @defer.inlineCallbacks
    def _background_reindex_search(self, progress, batch_size):
        target_min_stream_id = progress["target_min_stream_id_inclusive"]
        max_stream_id = progress["max_stream_id_exclusive"]
        rows_inserted = progress.get("rows_inserted", 0)

        INSERT_CLUMP_SIZE = 1000
        TYPES = ["m.room.name", "m.room.message", "m.room.topic"]

        def reindex_search_txn(txn):
            sql = (
                "SELECT stream_ordering, event_id, room_id, type, content FROM events"
                " WHERE ? <= stream_ordering AND stream_ordering < ?"
                " AND (%s)"
                " ORDER BY stream_ordering DESC"
                " LIMIT ?"
            ) % (" OR ".join("type = '%s'" % (t,) for t in TYPES),)

            txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size))

            rows = self.cursor_to_dict(txn)
            if not rows:
                return 0

            min_stream_id = rows[-1]["stream_ordering"]

            event_search_rows = []
            for row in rows:
                try:
                    event_id = row["event_id"]
                    room_id = row["room_id"]
                    etype = row["type"]
                    try:
                        content = json.loads(row["content"])
                    except Exception:
                        continue

                    if etype == "m.room.message":
                        key = "content.body"
                        value = content["body"]
                    elif etype == "m.room.topic":
                        key = "content.topic"
                        value = content["topic"]
                    elif etype == "m.room.name":
                        key = "content.name"
                        value = content["name"]
                except (KeyError, AttributeError):
                    # If the event is missing a necessary field then
                    # skip over it.
                    continue

                if not isinstance(value, basestring):
                    # If the event body, name or topic isn't a string
                    # then skip over it
                    continue

                event_search_rows.append((event_id, room_id, key, value))

            if isinstance(self.database_engine, PostgresEngine):
                sql = (
                    "INSERT INTO event_search (event_id, room_id, key, vector)"
                    " VALUES (?,?,?,to_tsvector('english', ?))"
                )
            elif isinstance(self.database_engine, Sqlite3Engine):
                sql = (
                    "INSERT INTO event_search (event_id, room_id, key, value)"
                    " VALUES (?,?,?,?)"
                )
            else:
                # This should be unreachable.
                raise Exception("Unrecognized database engine")

            for index in range(0, len(event_search_rows), INSERT_CLUMP_SIZE):
                clump = event_search_rows[index:index + INSERT_CLUMP_SIZE]
                txn.executemany(sql, clump)

            progress = {
                "target_min_stream_id_inclusive": target_min_stream_id,
                "max_stream_id_exclusive": min_stream_id,
                "rows_inserted": rows_inserted + len(event_search_rows)
            }

            self._background_update_progress_txn(
                txn, self.EVENT_SEARCH_UPDATE_NAME, progress
            )

            return len(event_search_rows)

        result = yield self.runInteraction(
            self.EVENT_SEARCH_UPDATE_NAME, reindex_search_txn
        )

        if not result:
            yield self._end_background_update(self.EVENT_SEARCH_UPDATE_NAME)

        defer.returnValue(result)
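
    # Progress sketch (illustrative, not from the original file): the update
    # walks backwards through the events table, so starting from
    # {"target_min_stream_id_inclusive": 0, "max_stream_id_exclusive": 10000}
    # with batch_size=100 it reindexes the 100 newest matching events below
    # stream ordering 10000, then lowers max_stream_id_exclusive for the next
    # run, finishing once a batch comes back empty.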

    @defer.inlineCallbacks
    def _background_reindex_gist_search(self, progress, batch_size):
        def create_index(conn):
            conn.rollback()
            # CREATE INDEX CONCURRENTLY cannot run inside a transaction
            # block, so switch the connection to autocommit while we build
            # the new index.
            conn.set_session(autocommit=True)
            c = conn.cursor()
            c.execute(
                "CREATE INDEX CONCURRENTLY event_search_fts_idx_gist"
                " ON event_search USING GIST (vector)"
            )
            c.execute("DROP INDEX event_search_fts_idx")
            conn.set_session(autocommit=False)

        if isinstance(self.database_engine, PostgresEngine):
            yield self.runWithConnection(create_index)

        yield self._end_background_update(self.EVENT_SEARCH_USE_GIST_POSTGRES_NAME)
        defer.returnValue(1)

    @defer.inlineCallbacks
    def _background_reindex_search_order(self, progress, batch_size):
        target_min_stream_id = progress["target_min_stream_id_inclusive"]
        max_stream_id = progress["max_stream_id_exclusive"]
        rows_inserted = progress.get("rows_inserted", 0)
        have_added_index = progress["have_added_indexes"]

        if not have_added_index:
            def create_index(conn):
                conn.rollback()
                conn.set_session(autocommit=True)
                c = conn.cursor()

                # We create with NULLS FIRST so that when we search *backwards*
                # we get the ones with non null origin_server_ts *first*
                c.execute(
                    "CREATE INDEX CONCURRENTLY event_search_room_order ON event_search("
                    "room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)"
                )
                c.execute(
                    "CREATE INDEX CONCURRENTLY event_search_order ON event_search("
                    "origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)"
                )
                conn.set_session(autocommit=False)

            yield self.runWithConnection(create_index)

            pg = dict(progress)
            pg["have_added_indexes"] = True

            yield self.runInteraction(
                self.EVENT_SEARCH_ORDER_UPDATE_NAME,
                self._background_update_progress_txn,
                self.EVENT_SEARCH_ORDER_UPDATE_NAME, pg,
            )

        def reindex_search_txn(txn):
            sql = (
                "UPDATE event_search AS es SET stream_ordering = e.stream_ordering,"
                " origin_server_ts = e.origin_server_ts"
                " FROM events AS e"
                " WHERE e.event_id = es.event_id"
                " AND ? <= e.stream_ordering AND e.stream_ordering < ?"
                " RETURNING es.stream_ordering"
            )

            min_stream_id = max_stream_id - batch_size
            txn.execute(sql, (min_stream_id, max_stream_id))
            rows = txn.fetchall()

            if min_stream_id < target_min_stream_id:
                # We've reached the end.
                return len(rows), False

            progress = {
                "target_min_stream_id_inclusive": target_min_stream_id,
                "max_stream_id_exclusive": min_stream_id,
                "rows_inserted": rows_inserted + len(rows),
                "have_added_indexes": True,
            }

            self._background_update_progress_txn(
                txn, self.EVENT_SEARCH_ORDER_UPDATE_NAME, progress
            )

            return len(rows), True

        num_rows, finished = yield self.runInteraction(
            self.EVENT_SEARCH_ORDER_UPDATE_NAME, reindex_search_txn
        )

        if not finished:
            yield self._end_background_update(self.EVENT_SEARCH_ORDER_UPDATE_NAME)

        defer.returnValue(num_rows)
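
    # Note (illustrative): unlike _background_reindex_search, which batches by
    # row count via LIMIT, this update steps max_stream_id down by batch_size
    # directly, so sparse stretches of stream orderings still make progress.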

    @defer.inlineCallbacks
    def search_msgs(self, room_ids, search_term, keys):
        """Performs a full text search over events with given keys.

        Args:
            room_ids (list): List of room ids to search in
            search_term (str): Search term to search for
            keys (list): List of keys to search in, currently supports
                "content.body", "content.name", "content.topic"

        Returns:
            Deferred[dict]: A dict with "results", "highlights" and "count"
                keys.
        """
        clauses = []

        search_query = _parse_query(self.database_engine, search_term)

        args = []

        # Make sure we don't explode because the person is in too many rooms.
        # We filter the results below regardless.
        if len(room_ids) < 500:
            clauses.append(
                "room_id IN (%s)" % (",".join(["?"] * len(room_ids)),)
            )
            args.extend(room_ids)

        local_clauses = []
        for key in keys:
            local_clauses.append("key = ?")
            args.append(key)

        clauses.append(
            "(%s)" % (" OR ".join(local_clauses),)
        )

        count_args = args
        count_clauses = clauses

        if isinstance(self.database_engine, PostgresEngine):
            sql = (
                "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank,"
                " room_id, event_id"
                " FROM event_search"
                " WHERE vector @@ to_tsquery('english', ?)"
            )
            args = [search_query, search_query] + args

            count_sql = (
                "SELECT room_id, count(*) as count FROM event_search"
                " WHERE vector @@ to_tsquery('english', ?)"
            )
            count_args = [search_query] + count_args
        elif isinstance(self.database_engine, Sqlite3Engine):
            sql = (
                "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id"
                " FROM event_search"
                " WHERE value MATCH ?"
            )
            args = [search_query] + args

            count_sql = (
                "SELECT room_id, count(*) as count FROM event_search"
                " WHERE value MATCH ?"
            )
            count_args = [search_term] + count_args
        else:
            # This should be unreachable.
            raise Exception("Unrecognized database engine")

        for clause in clauses:
            sql += " AND " + clause

        for clause in count_clauses:
            count_sql += " AND " + clause

        # We add an arbitrary limit here to ensure we don't try to pull the
        # entire table from the database.
        sql += " ORDER BY rank DESC LIMIT 500"

        results = yield self._execute(
            "search_msgs", self.cursor_to_dict, sql, *args
        )

        results = filter(lambda row: row["room_id"] in room_ids, results)

        events = yield self._get_events([r["event_id"] for r in results])

        event_map = {
            ev.event_id: ev
            for ev in events
        }

        highlights = None
        if isinstance(self.database_engine, PostgresEngine):
            highlights = yield self._find_highlights_in_postgres(search_query, events)

        count_sql += " GROUP BY room_id"

        count_results = yield self._execute(
            "search_rooms_count", self.cursor_to_dict, count_sql, *count_args
        )

        count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)

        defer.returnValue({
            "results": [
                {
                    "event": event_map[r["event_id"]],
                    "rank": r["rank"],
                }
                for r in results
                if r["event_id"] in event_map
            ],
            "highlights": highlights,
            "count": count,
        })
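
    # Illustrative shape of the deferred result (inferred from the code above,
    # not part of the original file):
    #
    #     {
    #         "results": [{"event": <EventBase>, "rank": 0.017}, ...],
    #         "highlights": a set of matched words on PostgreSQL, else None,
    #         "count": total matches across the searched rooms,
    #     }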

    @defer.inlineCallbacks
    def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None):
        """Performs a full text search over events with given keys.

        Args:
            room_ids (list): The room_ids to search in
            search_term (str): Search term to search for
            keys (list): List of keys to search in, currently supports
                "content.body", "content.name", "content.topic"
            pagination_token (str): A pagination token previously returned

        Returns:
            Deferred[dict]: A dict with "results", "highlights" and "count"
                keys.
        """
        clauses = []

        search_query = _parse_query(self.database_engine, search_term)

        args = []

        # Make sure we don't explode because the person is in too many rooms.
        # We filter the results below regardless.
        if len(room_ids) < 500:
            clauses.append(
                "room_id IN (%s)" % (",".join(["?"] * len(room_ids)),)
            )
            args.extend(room_ids)

        local_clauses = []
        for key in keys:
            local_clauses.append("key = ?")
            args.append(key)

        clauses.append(
            "(%s)" % (" OR ".join(local_clauses),)
        )

        # take copies of the current args and clauses lists, before adding
        # pagination clauses to main query.
        count_args = list(args)
        count_clauses = list(clauses)

        if pagination_token:
            try:
                origin_server_ts, stream = pagination_token.split(",")
                origin_server_ts = int(origin_server_ts)
                stream = int(stream)
            except Exception:
                raise SynapseError(400, "Invalid pagination token")

            clauses.append(
                "(origin_server_ts < ?"
                " OR (origin_server_ts = ? AND stream_ordering < ?))"
            )
            args.extend([origin_server_ts, origin_server_ts, stream])

        if isinstance(self.database_engine, PostgresEngine):
            sql = (
                "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank,"
                " origin_server_ts, stream_ordering, room_id, event_id"
                " FROM event_search"
                " WHERE vector @@ to_tsquery('english', ?) AND "
            )
            args = [search_query, search_query] + args

            count_sql = (
                "SELECT room_id, count(*) as count FROM event_search"
                " WHERE vector @@ to_tsquery('english', ?) AND "
            )
            count_args = [search_query] + count_args
        elif isinstance(self.database_engine, Sqlite3Engine):
            # We use CROSS JOIN here to ensure we use the right indexes.
            # https://sqlite.org/optoverview.html#crossjoin
            #
            # We want to use the full text search index on event_search to
            # extract all possible matches first, then lookup those matches
            # in the events table to get the topological ordering. We need
            # to use the indexes in this order because sqlite refuses to
            # MATCH unless it uses the full text search index
            sql = (
                "SELECT rank(matchinfo) as rank, room_id, event_id,"
                " origin_server_ts, stream_ordering"
                " FROM (SELECT key, event_id, matchinfo(event_search) as matchinfo"
                " FROM event_search"
                " WHERE value MATCH ?"
                " )"
                " CROSS JOIN events USING (event_id)"
                " WHERE "
            )
            args = [search_query] + args

            count_sql = (
                "SELECT room_id, count(*) as count FROM event_search"
                " WHERE value MATCH ? AND "
            )
            count_args = [search_term] + count_args
        else:
            # This should be unreachable.
            raise Exception("Unrecognized database engine")

        sql += " AND ".join(clauses)
        count_sql += " AND ".join(count_clauses)

        # We add an arbitrary limit here to ensure we don't try to pull the
        # entire table from the database.
        if isinstance(self.database_engine, PostgresEngine):
            sql += (
                " ORDER BY origin_server_ts DESC NULLS LAST,"
                " stream_ordering DESC NULLS LAST LIMIT ?"
            )
        elif isinstance(self.database_engine, Sqlite3Engine):
            sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?"
        else:
            raise Exception("Unrecognized database engine")

        args.append(limit)

        results = yield self._execute(
            "search_rooms", self.cursor_to_dict, sql, *args
        )

        results = filter(lambda row: row["room_id"] in room_ids, results)

        events = yield self._get_events([r["event_id"] for r in results])

        event_map = {
            ev.event_id: ev
            for ev in events
        }

        highlights = None
        if isinstance(self.database_engine, PostgresEngine):
            highlights = yield self._find_highlights_in_postgres(search_query, events)

        count_sql += " GROUP BY room_id"

        count_results = yield self._execute(
            "search_rooms_count", self.cursor_to_dict, count_sql, *count_args
        )

        count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)

        defer.returnValue({
            "results": [
                {
                    "event": event_map[r["event_id"]],
                    "rank": r["rank"],
                    "pagination_token": "%s,%s" % (
                        r["origin_server_ts"], r["stream_ordering"]
                    ),
                }
                for r in results
                if r["event_id"] in event_map
            ],
            "highlights": highlights,
            "count": count,
        })
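
    # Pagination sketch (illustrative): each result carries a pagination_token
    # of the form "origin_server_ts,stream_ordering", e.g. "1414273153425,42".
    # Passing the last result's token back in as pagination_token restricts
    # the next call to events strictly older than that position.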

    def _find_highlights_in_postgres(self, search_query, events):
        """Given a list of events and a search term, return a list of words
        that match from the content of the event.

        This is used to give a list of words that clients can match against to
        highlight the matching parts.

        Args:
            search_query (str)
            events (list): A list of events

        Returns:
            deferred: A set of strings.
        """
        def f(txn):
            highlight_words = set()
            for event in events:
                # As a hack we simply join values of all possible keys. This is
                # fine since we're only using them to find possible highlights.
                values = []
                for key in ("body", "name", "topic"):
                    v = event.content.get(key, None)
                    if v:
                        values.append(v)

                if not values:
                    continue

                value = " ".join(values)

                # We need to find some values for StartSel and StopSel that
                # aren't in the value so that we can pick results out.
                start_sel = "<"
                stop_sel = ">"

                while start_sel in value:
                    start_sel += "<"
                while stop_sel in value:
                    stop_sel += ">"

                query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % (
                    _to_postgres_options({
                        "StartSel": start_sel,
                        "StopSel": stop_sel,
                        "MaxFragments": "50",
                    })
                )
                txn.execute(query, (value, search_query,))
                headline, = txn.fetchall()[0]

                # Now we need to pick the possible highlights out of the
                # headline result.
                matcher_regex = "%s(.*?)%s" % (
                    re.escape(start_sel),
                    re.escape(stop_sel),
                )

                res = re.findall(matcher_regex, headline)
                highlight_words.update([r.lower() for r in res])

            return highlight_words

        return self.runInteraction("_find_highlights", f)
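
    # Illustrative example (assumed behaviour, not from the original file):
    # with StartSel "<" and StopSel ">", ts_headline may return a fragment
    # like "hello <world>" for the query "world:*"; the regex above then
    # recovers "world" as a highlight word.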


def _to_postgres_options(options_dict):
    return "'%s'" % (
        ",".join("%s=%s" % (k, v) for k, v in options_dict.items()),
    )
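
# For example (illustrative), _to_postgres_options({"StartSel": "<",
# "StopSel": ">", "MaxFragments": "50"}) returns something like
# "'StartSel=<,StopSel=>,MaxFragments=50'" (key order may vary in a dict),
# which is interpolated directly into the ts_headline() call above.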


def _parse_query(database_engine, search_term):
    """Takes a plain unicode string from the user and converts it into a form
    that can be passed to the database.

    We use this so that we can add prefix matching, which isn't something
    that is supported by default.
    """

    # Pull out the individual words, discarding any non-word characters.
    results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)

    if isinstance(database_engine, PostgresEngine):
        return " & ".join(result + ":*" for result in results)
    elif isinstance(database_engine, Sqlite3Engine):
        return " & ".join(result + "*" for result in results)
    else:
        # This should be unreachable.
        raise Exception("Unrecognized database engine")
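
# Illustrative examples (assumed, not from the original file):
#
#   _parse_query(postgres_engine, u"foo bar")  ->  u"foo:* & bar:*"
#   _parse_query(sqlite3_engine, u"foo bar")   ->  u"foo* & bar*"
#
# Both forms request prefix matching, so searching for "foo" also matches
# "foobar".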