test_room_search.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. # Copyright 2021 The Matrix.org Foundation C.I.C.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from typing import List, Tuple
  15. from unittest.case import SkipTest
  16. from twisted.test.proto_helpers import MemoryReactor
  17. import synapse.rest.admin
  18. from synapse.api.constants import EventTypes
  19. from synapse.api.errors import StoreError
  20. from synapse.rest.client import login, room
  21. from synapse.server import HomeServer
  22. from synapse.storage.databases.main import DataStore
  23. from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query
  24. from synapse.storage.engines import PostgresEngine
  25. from synapse.storage.engines.sqlite import Sqlite3Engine
  26. from synapse.util import Clock
  27. from tests.unittest import HomeserverTestCase, skip_unless
  28. from tests.utils import USE_POSTGRES_FOR_TESTS
  29. class EventSearchInsertionTest(HomeserverTestCase):
  30. servlets = [
  31. synapse.rest.admin.register_servlets_for_client_rest_resource,
  32. login.register_servlets,
  33. room.register_servlets,
  34. ]
  35. def test_null_byte(self) -> None:
  36. """
  37. Postgres/SQLite don't like null bytes going into the search tables. Internally
  38. we replace those with a space.
  39. Ensure this doesn't break anything.
  40. """
  41. # Register a user and create a room, create some messages
  42. self.register_user("alice", "password")
  43. access_token = self.login("alice", "password")
  44. room_id = self.helper.create_room_as("alice", tok=access_token)
  45. # Send messages and ensure they don't cause an internal server
  46. # error
  47. for body in ["hi\u0000bob", "another message", "hi alice"]:
  48. response = self.helper.send(room_id, body, tok=access_token)
  49. self.assertIn("event_id", response)
  50. # Check that search works for the message where the null byte was replaced
  51. store = self.hs.get_datastores().main
  52. result = self.get_success(
  53. store.search_msgs([room_id], "hi bob", ["content.body"])
  54. )
  55. self.assertEqual(result.get("count"), 1)
  56. if isinstance(store.database_engine, PostgresEngine):
  57. self.assertIn("hi", result.get("highlights"))
  58. self.assertIn("bob", result.get("highlights"))
  59. # Check that search works for an unrelated message
  60. result = self.get_success(
  61. store.search_msgs([room_id], "another", ["content.body"])
  62. )
  63. self.assertEqual(result.get("count"), 1)
  64. if isinstance(store.database_engine, PostgresEngine):
  65. self.assertIn("another", result.get("highlights"))
  66. # Check that search works for a search term that overlaps with the message
  67. # containing a null byte and an unrelated message.
  68. result = self.get_success(store.search_msgs([room_id], "hi", ["content.body"]))
  69. self.assertEqual(result.get("count"), 2)
  70. result = self.get_success(
  71. store.search_msgs([room_id], "hi alice", ["content.body"])
  72. )
  73. if isinstance(store.database_engine, PostgresEngine):
  74. self.assertIn("alice", result.get("highlights"))
  75. def test_non_string(self) -> None:
  76. """Test that non-string `value`s are not inserted into `event_search`.
  77. This is particularly important when using sqlite, since a sqlite column can hold
  78. both strings and integers. When using Postgres, integers are automatically
  79. converted to strings.
  80. Regression test for #11918.
  81. """
  82. store = self.hs.get_datastores().main
  83. # Register a user and create a room
  84. user_id = self.register_user("alice", "password")
  85. access_token = self.login("alice", "password")
  86. room_id = self.helper.create_room_as("alice", tok=access_token)
  87. room_version = self.get_success(store.get_room_version(room_id))
  88. # Construct a message with a numeric body to be received over federation
  89. # The message can't be sent using the client API, since Synapse's event
  90. # validation will reject it.
  91. prev_event_ids = self.get_success(store.get_prev_events_for_room(room_id))
  92. prev_event = self.get_success(store.get_event(prev_event_ids[0]))
  93. prev_state_map = self.get_success(
  94. self.hs.get_storage_controllers().state.get_state_ids_for_event(
  95. prev_event_ids[0]
  96. )
  97. )
  98. event_dict = {
  99. "type": EventTypes.Message,
  100. "content": {"msgtype": "m.text", "body": 2},
  101. "room_id": room_id,
  102. "sender": user_id,
  103. "prev_events": prev_event_ids,
  104. "origin_server_ts": self.clock.time_msec(),
  105. }
  106. builder = self.hs.get_event_builder_factory().for_room_version(
  107. room_version, event_dict
  108. )
  109. event = self.get_success(
  110. builder.build(
  111. prev_event_ids=prev_event_ids,
  112. auth_event_ids=self.hs.get_event_auth_handler().compute_auth_events(
  113. builder,
  114. prev_state_map,
  115. for_verification=False,
  116. ),
  117. depth=prev_event.depth + 1,
  118. )
  119. )
  120. # Receive the event
  121. self.get_success(
  122. self.hs.get_federation_event_handler().on_receive_pdu(
  123. self.hs.hostname, event
  124. )
  125. )
  126. # The event should not have an entry in the `event_search` table
  127. f = self.get_failure(
  128. store.db_pool.simple_select_one_onecol(
  129. "event_search",
  130. {"room_id": room_id, "event_id": event.event_id},
  131. "event_id",
  132. ),
  133. StoreError,
  134. )
  135. self.assertEqual(f.value.code, 404)
  136. @skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite")
  137. def test_sqlite_non_string_deletion_background_update(self) -> None:
  138. """Test the background update to delete bad rows from `event_search`."""
  139. store = self.hs.get_datastores().main
  140. # Populate `event_search` with dummy data
  141. self.get_success(
  142. store.db_pool.simple_insert_many(
  143. "event_search",
  144. keys=["event_id", "room_id", "key", "value"],
  145. values=[
  146. ("event1", "room_id", "content.body", "hi"),
  147. ("event2", "room_id", "content.body", "2"),
  148. ("event3", "room_id", "content.body", 3),
  149. ],
  150. desc="populate_event_search",
  151. )
  152. )
  153. # Run the background update
  154. store.db_pool.updates._all_done = False
  155. self.get_success(
  156. store.db_pool.simple_insert(
  157. "background_updates",
  158. {
  159. "update_name": "event_search_sqlite_delete_non_strings",
  160. "progress_json": "{}",
  161. },
  162. )
  163. )
  164. self.wait_for_background_updates()
  165. # The non-string `value`s ought to be gone now.
  166. values = self.get_success(
  167. store.db_pool.simple_select_onecol(
  168. "event_search",
  169. {"room_id": "room_id"},
  170. "value",
  171. ),
  172. )
  173. self.assertCountEqual(values, ["hi", "2"])
  174. class MessageSearchTest(HomeserverTestCase):
  175. """
  176. Check message search.
  177. A powerful way to check the behaviour is to run the following in Postgres >= 11:
  178. # SELECT websearch_to_tsquery('english', <your string>);
  179. The result can be compared to the tokenized version for SQLite and Postgres < 11.
  180. """
  181. servlets = [
  182. synapse.rest.admin.register_servlets_for_client_rest_resource,
  183. login.register_servlets,
  184. room.register_servlets,
  185. ]
  186. PHRASE = "the quick brown fox jumps over the lazy dog"
  187. # Each entry is a search query, followed by a boolean of whether it is in the phrase.
  188. COMMON_CASES = [
  189. ("nope", False),
  190. ("brown", True),
  191. ("quick brown", True),
  192. ("brown quick", True),
  193. ("quick \t brown", True),
  194. ("jump", True),
  195. ("brown nope", False),
  196. ('"brown quick"', False),
  197. ('"jumps over"', True),
  198. ('"quick fox"', False),
  199. ("nope OR doublenope", False),
  200. ("furphy OR fox", True),
  201. ("fox -nope", True),
  202. ("fox -brown", False),
  203. ('"fox" quick', True),
  204. ('"quick brown', True),
  205. ('" quick "', True),
  206. ('" nope"', False),
  207. ]
  208. # TODO Test non-ASCII cases.
  209. # Case that fail on SQLite.
  210. POSTGRES_CASES = [
  211. # SQLite treats NOT as a binary operator.
  212. ("- fox", False),
  213. ("- nope", True),
  214. ('"-fox quick', False),
  215. # PostgreSQL skips stop words.
  216. ('"the quick brown"', True),
  217. ('"over lazy"', True),
  218. ]
  219. def prepare(
  220. self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
  221. ) -> None:
  222. # Register a user and create a room, create some messages
  223. self.register_user("alice", "password")
  224. self.access_token = self.login("alice", "password")
  225. self.room_id = self.helper.create_room_as("alice", tok=self.access_token)
  226. # Send the phrase as a message and check it was created
  227. response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token)
  228. self.assertIn("event_id", response)
  229. # The behaviour of a missing trailing double quote changed in PostgreSQL 14
  230. # from ignoring the initial double quote to treating it as a phrase.
  231. main_store = homeserver.get_datastores().main
  232. found = False
  233. if isinstance(main_store.database_engine, PostgresEngine):
  234. assert main_store.database_engine._version is not None
  235. found = main_store.database_engine._version < 140000
  236. self.COMMON_CASES.append(('"fox quick', found))
  237. def test_tokenize_query(self) -> None:
  238. """Test the custom logic to tokenize a user's query."""
  239. cases = (
  240. ("brown", ["brown"]),
  241. ("quick brown", ["quick", SearchToken.And, "brown"]),
  242. ("quick \t brown", ["quick", SearchToken.And, "brown"]),
  243. ('"brown quick"', [Phrase(["brown", "quick"])]),
  244. ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]),
  245. ("fox -brown", ["fox", SearchToken.Not, "brown"]),
  246. ("- fox", [SearchToken.Not, "fox"]),
  247. ('"fox" quick', [Phrase(["fox"]), SearchToken.And, "quick"]),
  248. # No trailing double quote.
  249. ('"fox quick', [Phrase(["fox", "quick"])]),
  250. ('"-fox quick', [Phrase(["-fox", "quick"])]),
  251. ('" quick "', [Phrase(["quick"])]),
  252. (
  253. 'q"uick brow"n',
  254. [
  255. "q",
  256. SearchToken.And,
  257. Phrase(["uick", "brow"]),
  258. SearchToken.And,
  259. "n",
  260. ],
  261. ),
  262. (
  263. '-"quick brown"',
  264. [SearchToken.Not, Phrase(["quick", "brown"])],
  265. ),
  266. )
  267. for query, expected in cases:
  268. tokenized = _tokenize_query(query)
  269. self.assertEqual(
  270. tokenized, expected, f"{tokenized} != {expected} for {query}"
  271. )
  272. def _check_test_cases(
  273. self, store: DataStore, cases: List[Tuple[str, bool]]
  274. ) -> None:
  275. # Run all the test cases versus search_msgs
  276. for query, expect_to_contain in cases:
  277. result = self.get_success(
  278. store.search_msgs([self.room_id], query, ["content.body"])
  279. )
  280. self.assertEquals(
  281. result["count"],
  282. 1 if expect_to_contain else 0,
  283. f"expected '{query}' to match '{self.PHRASE}'"
  284. if expect_to_contain
  285. else f"'{query}' unexpectedly matched '{self.PHRASE}'",
  286. )
  287. self.assertEquals(
  288. len(result["results"]),
  289. 1 if expect_to_contain else 0,
  290. "results array length should match count",
  291. )
  292. # Run them again versus search_rooms
  293. for query, expect_to_contain in cases:
  294. result = self.get_success(
  295. store.search_rooms([self.room_id], query, ["content.body"], 10)
  296. )
  297. self.assertEquals(
  298. result["count"],
  299. 1 if expect_to_contain else 0,
  300. f"expected '{query}' to match '{self.PHRASE}'"
  301. if expect_to_contain
  302. else f"'{query}' unexpectedly matched '{self.PHRASE}'",
  303. )
  304. self.assertEquals(
  305. len(result["results"]),
  306. 1 if expect_to_contain else 0,
  307. "results array length should match count",
  308. )
  309. def test_postgres_web_search_for_phrase(self) -> None:
  310. """
  311. Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery.
  312. This test is skipped unless the postgres instance supports websearch_to_tsquery.
  313. See https://www.postgresql.org/docs/current/textsearch-controls.html
  314. """
  315. store = self.hs.get_datastores().main
  316. if not isinstance(store.database_engine, PostgresEngine):
  317. raise SkipTest("Test only applies when postgres is used as the database")
  318. self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES)
  319. def test_sqlite_search(self) -> None:
  320. """
  321. Test sqlite searching for phrases.
  322. """
  323. store = self.hs.get_datastores().main
  324. if not isinstance(store.database_engine, Sqlite3Engine):
  325. raise SkipTest("Test only applies when sqlite is used as the database")
  326. self._check_test_cases(store, self.COMMON_CASES)