12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711 |
- # Copyright 2018-2021 The Matrix.org Foundation C.I.C.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import re
- from typing import Any, Dict, Set, Tuple
- from unittest import mock
- from unittest.mock import Mock, patch
- from twisted.test.proto_helpers import MemoryReactor
- from synapse.api.constants import EventTypes, Membership, UserTypes
- from synapse.appservice import ApplicationService
- from synapse.rest import admin
- from synapse.rest.client import login, register, room
- from synapse.server import HomeServer
- from synapse.storage import DataStore
- from synapse.storage.background_updates import _BackgroundUpdateHandler
- from synapse.storage.databases.main import user_directory
- from synapse.storage.databases.main.user_directory import (
- _parse_words_with_icu,
- _parse_words_with_regex,
- )
- from synapse.storage.roommember import ProfileInfo
- from synapse.util import Clock
- from tests.server import ThreadedMemoryReactorClock
- from tests.test_utils.event_injection import inject_member_event
- from tests.unittest import HomeserverTestCase, override_config
# PyICU is an optional dependency. When it cannot be imported, `icu` is bound
# to None so that code further down can test for its absence.
try:
    import icu
except ImportError:
    icu = None  # type: ignore

# Matrix user IDs used as fixtures by the search tests.
ALICE = "@alice:a"
BOB = "@bob:b"
BOBBY = "@bobby:a"
# BELA's localpart is deliberately not 'Bela', so that we can test looking a
# user up by display name rather than by user ID.
BELA = "@somenickname:example.org"
class GetUserDirectoryTables:
    """Read-only accessors for the user directory tables.

    These helpers live in their own class so that they can be reused in
    tests/handlers/test_user_directory.py.
    """

    def __init__(self, store: DataStore):
        # Every query below goes through `store.db_pool.simple_select_list`.
        self.store = store

    async def get_users_in_public_rooms(self) -> Set[Tuple[str, str]]:
        """Fetch the entire `users_in_public_rooms` table.

        Returns:
            A set of tuples (user_id, room_id), one per row of the table:
            room_id is public and contains the user with the given id.
        """
        rows = await self.store.db_pool.simple_select_list(
            "users_in_public_rooms", None, ("user_id", "room_id")
        )
        return {(row["user_id"], row["room_id"]) for row in rows}

    async def get_users_who_share_private_rooms(self) -> Set[Tuple[str, str, str]]:
        """Fetch the entire `users_who_share_private_rooms` table.

        Returns:
            A set of tuples (user_id, other_user_id, room_id) corresponding
            to the rows of `users_who_share_private_rooms`.
        """
        rows = await self.store.db_pool.simple_select_list(
            "users_who_share_private_rooms",
            None,
            ["user_id", "other_user_id", "room_id"],
        )
        return {
            (row["user_id"], row["other_user_id"], row["room_id"]) for row in rows
        }

    async def get_users_in_user_directory(self) -> Set[str]:
        """Fetch the set of users in the `user_directory` table.

        This is useful when checking we've correctly excluded users from the
        directory.

        Returns:
            The `user_id` of every row in `user_directory`.
        """
        rows = await self.store.db_pool.simple_select_list(
            "user_directory",
            None,
            ["user_id"],
        )
        return {row["user_id"] for row in rows}

    async def get_profiles_in_user_directory(self) -> Dict[str, ProfileInfo]:
        """Fetch users and their profiles from the `user_directory` table.

        This is useful when we want to inspect display names and avatars.
        It's almost the entire contents of the `user_directory` table: the only
        thing missing is an unused room_id column.

        Returns:
            A dict mapping each user_id to a ProfileInfo built from that row's
            `display_name` and `avatar_url` columns.
        """
        rows = await self.store.db_pool.simple_select_list(
            "user_directory",
            None,
            ("user_id", "display_name", "avatar_url"),
        )
        return {
            row["user_id"]: ProfileInfo(
                display_name=row["display_name"], avatar_url=row["avatar_url"]
            )
            for row in rows
        }

    async def get_tables(
        self,
    ) -> Tuple[Set[str], Set[Tuple[str, str]], Set[Tuple[str, str, str]]]:
        """Multiple tests want to inspect these tables, so expose them together.

        Returns:
            A 3-tuple of, in order: the users in `user_directory`, the rows of
            `users_in_public_rooms` and the rows of
            `users_who_share_private_rooms`. The three queries are awaited one
            after another, in that order.
        """
        return (
            await self.get_users_in_user_directory(),
            await self.get_users_in_public_rooms(),
            await self.get_users_who_share_private_rooms(),
        )
class UserDirectoryInitialPopulationTestcase(HomeserverTestCase):
    """Ensure that rebuilding the directory writes the correct data to the DB.

    See also tests/handlers/test_user_directory.py for similar checks. They
    test the incremental updates, rather than the big rebuild.
    """

    servlets = [
        login.register_servlets,
        admin.register_servlets,
        room.register_servlets,
        register.register_servlets,
    ]

    def make_homeserver(
        self, reactor: ThreadedMemoryReactorClock, clock: Clock
    ) -> HomeServer:
        """Build the homeserver with a single application service configured.

        `load_appservices` is patched for the duration of the homeserver's
        construction so that it returns `self.appservice`.
        """
        self.appservice = ApplicationService(
            token="i_am_an_app_service",
            id="1234",
            namespaces={"users": [{"regex": r"@as_user.*", "exclusive": True}]},
            sender="@as:test",
        )

        mock_load_appservices = Mock(return_value=[self.appservice])
        with patch(
            "synapse.storage.databases.main.appservice.load_appservices",
            mock_load_appservices,
        ):
            hs = super().make_homeserver(reactor, clock)
        return hs

    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
        """Keep handles on the main datastore and the table-inspection helper."""
        self.store = hs.get_datastores().main
        self.user_dir_helper = GetUserDirectoryTables(self.store)

    def _purge_and_rebuild_user_dir(self) -> None:
        """Nuke the user directory tables, start the background process to
        repopulate them, and wait for the process to complete. This allows us
        to inspect the outcome of the background process alone, without any of
        the other incremental updates.
        """
        self.get_success(self.store.update_user_directory_stream_pos(None))
        self.get_success(self.store.delete_all_from_user_dir())

        shares_private = self.get_success(
            self.user_dir_helper.get_users_who_share_private_rooms()
        )
        public_users = self.get_success(
            self.user_dir_helper.get_users_in_public_rooms()
        )

        # Nothing updated yet
        self.assertEqual(shares_private, set())
        self.assertEqual(public_users, set())

        # Ugh, have to reset this flag
        self.store.db_pool.updates._all_done = False

        # The rebuild is a chain of background updates, listed here in the
        # order they are queued. Each one depends on the one before it.
        stages = (
            "populate_user_directory_createtables",
            "populate_user_directory_process_rooms",
            "populate_user_directory_process_users",
            "populate_user_directory_cleanup",
        )
        # The first stage has no prerequisite, so its row carries no
        # "depends_on" key at all.
        rows = [{"update_name": stages[0], "progress_json": "{}"}]
        rows.extend(
            {
                "update_name": stage,
                "progress_json": "{}",
                "depends_on": prerequisite,
            }
            for prerequisite, stage in zip(stages, stages[1:])
        )
        for row in rows:
            self.get_success(
                self.store.db_pool.simple_insert("background_updates", row)
            )

        self.wait_for_background_updates()

    def test_initial(self) -> None:
        """
        The user directory's initial handler correctly updates the search tables.
        """
        u1 = self.register_user("user1", "pass")
        u1_token = self.login(u1, "pass")
        u2 = self.register_user("user2", "pass")
        u2_token = self.login(u2, "pass")
        u3 = self.register_user("user3", "pass")
        u3_token = self.login(u3, "pass")

        # Named `public_room` rather than `room` so that the imported
        # `synapse.rest.client.room` module is not shadowed.
        public_room = self.helper.create_room_as(u1, is_public=True, tok=u1_token)
        self.helper.invite(public_room, src=u1, targ=u2, tok=u1_token)
        self.helper.join(public_room, user=u2, tok=u2_token)

        private_room = self.helper.create_room_as(u1, is_public=False, tok=u1_token)
        self.helper.invite(private_room, src=u1, targ=u3, tok=u1_token)
        self.helper.join(private_room, user=u3, tok=u3_token)

        # Do the initial population of the user directory via the background update
        self._purge_and_rebuild_user_dir()

        users, in_public, in_private = self.get_success(
            self.user_dir_helper.get_tables()
        )

        # User 1 and User 2 are in the same public room
        self.assertEqual(in_public, {(u1, public_room), (u2, public_room)})
        # User 1 and User 3 share private rooms
        self.assertEqual(in_private, {(u1, u3, private_room), (u3, u1, private_room)})
        # All three should have entries in the directory
        self.assertEqual(users, {u1, u2, u3})

    # The next four tests (test_population_excludes_*) all set up
    #   - A normal user included in the user dir
    #   - A public and private room created by that user
    #   - A user excluded from the room dir, belonging to both rooms

    # They match similar logic in handlers/test_user_directory.py But that tests
    # updating the directory; this tests rebuilding it from scratch.

    def _create_rooms_and_inject_memberships(
        self, creator: str, token: str, joiner: str
    ) -> Tuple[str, str]:
        """Create a public and private room as a normal user.
        Then get the `joiner` into those rooms.

        Returns:
            A tuple (public_room_id, private_room_id).
        """
        public_room = self.helper.create_room_as(
            creator,
            is_public=True,
            # See https://github.com/matrix-org/synapse/issues/10951
            extra_content={"visibility": "public"},
            tok=token,
        )
        private_room = self.helper.create_room_as(creator, is_public=False, tok=token)

        # HACK: get the user into these rooms
        self.get_success(inject_member_event(self.hs, public_room, joiner, "join"))
        self.get_success(inject_member_event(self.hs, private_room, joiner, "join"))

        return public_room, private_room

    def _check_room_sharing_tables(
        self, normal_user: str, public_room: str, private_room: str
    ) -> None:
        """Assert that `normal_user` is the only user the rebuild recorded.

        `private_room` is accepted for symmetry with the value returned by
        `_create_rooms_and_inject_memberships`; the private-sharing table is
        expected to be empty, so it is not referenced in the assertions.
        """
        # After rebuilding the directory, we should only see the normal user.
        users, in_public, in_private = self.get_success(
            self.user_dir_helper.get_tables()
        )
        self.assertEqual(users, {normal_user})
        self.assertEqual(in_public, {(normal_user, public_room)})
        self.assertEqual(in_private, set())

    def test_population_excludes_support_user(self) -> None:
        """A user of type SUPPORT is left out when the directory is rebuilt."""
        # Create a normal and support user.
        user = self.register_user("user", "pass")
        token = self.login(user, "pass")
        support = "@support1:test"
        self.get_success(
            self.store.register_user(
                user_id=support, password_hash=None, user_type=UserTypes.SUPPORT
            )
        )

        # Join the support user to rooms owned by the normal user.
        public, private = self._create_rooms_and_inject_memberships(
            user, token, support
        )

        # Rebuild the directory.
        self._purge_and_rebuild_user_dir()

        # Check the support user is not in the directory.
        self._check_room_sharing_tables(user, public, private)

    def test_population_excludes_deactivated_user(self) -> None:
        """A deactivated user is left out when the directory is rebuilt."""
        user = self.register_user("naughty", "pass")
        # Named `admin_user` rather than `admin` so that the imported
        # `synapse.rest.admin` module is not shadowed.
        admin_user = self.register_user("admin", "pass", admin=True)
        admin_token = self.login(admin_user, "pass")

        # Deactivate the user.
        channel = self.make_request(
            "PUT",
            f"/_synapse/admin/v2/users/{user}",
            access_token=admin_token,
            content={"deactivated": True},
        )
        self.assertEqual(channel.code, 200)
        self.assertEqual(channel.json_body["deactivated"], True)

        # Join the deactivated user to rooms owned by the admin.
        # Is this something that could actually happen outside of a test?
        public, private = self._create_rooms_and_inject_memberships(
            admin_user, admin_token, user
        )

        # Rebuild the user dir. The deactivated user should be missing.
        self._purge_and_rebuild_user_dir()
        self._check_room_sharing_tables(admin_user, public, private)

    def test_population_excludes_appservice_user(self) -> None:
        """A user registered by the appservice is left out of the rebuild."""
        # Register an AS user.
        user = self.register_user("user", "pass")
        token = self.login(user, "pass")
        as_user, _ = self.register_appservice_user(
            "as_user_potato", self.appservice.token
        )

        # Join the AS user to rooms owned by the normal user.
        public, private = self._create_rooms_and_inject_memberships(
            user, token, as_user
        )

        # Rebuild the directory.
        self._purge_and_rebuild_user_dir()

        # Check the AS user is not in the directory.
        self._check_room_sharing_tables(user, public, private)

    def test_population_excludes_appservice_sender(self) -> None:
        """The appservice's own sender user is left out of the rebuild."""
        user = self.register_user("user", "pass")
        token = self.login(user, "pass")

        # Join the AS sender to rooms owned by the normal user.
        public, private = self._create_rooms_and_inject_memberships(
            user, token, self.appservice.sender
        )

        # Rebuild the directory.
        self._purge_and_rebuild_user_dir()

        # Check the AS sender is not in the directory.
        self._check_room_sharing_tables(user, public, private)

    def test_population_conceals_private_nickname(self) -> None:
        """A display name set only inside a private room must not end up in
        the directory; the rebuilt entry carries the user's profile name.
        """
        # Make a private room, and set a nickname within
        user = self.register_user("aaaa", "pass")
        user_token = self.login(user, "pass")
        private_room = self.helper.create_room_as(user, is_public=False, tok=user_token)
        self.helper.send_state(
            private_room,
            EventTypes.Member,
            state_key=user,
            body={"membership": Membership.JOIN, "displayname": "BBBB"},
            tok=user_token,
        )

        # Rebuild the user directory. Make the rescan of the `users` table a no-op
        # so we only see the effect of scanning the `room_memberships` table.
        async def mocked_process_users(*args: Any, **kwargs: Any) -> int:
            # Mark the stage as finished straight away, without doing its work.
            await self.store.db_pool.updates._end_background_update(
                "populate_user_directory_process_users"
            )
            return 1

        with mock.patch.dict(
            self.store.db_pool.updates._background_update_handlers,
            populate_user_directory_process_users=_BackgroundUpdateHandler(
                mocked_process_users,
            ),
        ):
            self._purge_and_rebuild_user_dir()

        # Local users are ignored by the scan over rooms
        users = self.get_success(self.user_dir_helper.get_profiles_in_user_directory())
        self.assertEqual(users, {})

        # Do a full rebuild including the scan over the `users` table. The local
        # user should appear with their profile name.
        self._purge_and_rebuild_user_dir()
        users = self.get_success(self.user_dir_helper.get_profiles_in_user_directory())
        self.assertEqual(
            users, {user: ProfileInfo(display_name="aaaa", avatar_url=None)}
        )
class UserDirectoryStoreTestCase(HomeserverTestCase):
    """Tests for `search_user_dir` against a small, fixed set of directory entries.

    `use_icu` is the value assigned to `user_directory.USE_ICU` while each
    test runs; subclasses can override it to repeat the suite with the other
    setting.
    """

    use_icu = False

    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
        """Populate the directory and switch `USE_ICU` to `self.use_icu`."""
        self.store = hs.get_datastores().main

        # alice and bob are both in !room_id. bobby is not but shares
        # a homeserver with alice.
        self.get_success(self.store.update_profile_in_user_dir(ALICE, "alice", None))
        self.get_success(self.store.update_profile_in_user_dir(BOB, "bob", None))
        self.get_success(self.store.update_profile_in_user_dir(BOBBY, "bobby", None))
        self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))

        # USE_ICU is a module-level flag, so remember the previous value to put
        # it back in tearDown.
        self._restore_use_icu = user_directory.USE_ICU
        user_directory.USE_ICU = self.use_icu

    def tearDown(self) -> None:
        """Restore the module-level `USE_ICU` flag, then run the base teardown."""
        user_directory.USE_ICU = self._restore_use_icu
        # Chain up so that any cleanup defined by a parent class still runs.
        super().tearDown()

    def test_search_user_dir(self) -> None:
        """Without `search_all_users`, only users sharing a room are found."""
        # normally when alice searches the directory she should just find
        # bob because bobby doesn't share a room with her.
        r = self.get_success(self.store.search_user_dir(ALICE, "bob", 10))
        self.assertFalse(r["limited"])
        self.assertEqual(1, len(r["results"]))
        self.assertDictEqual(
            r["results"][0], {"user_id": BOB, "display_name": "bob", "avatar_url": None}
        )

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_all_users(self) -> None:
        """With `search_all_users`, bobby is found even though alice shares no
        room with him.
        """
        r = self.get_success(self.store.search_user_dir(ALICE, "bob", 10))
        self.assertFalse(r["limited"])
        self.assertEqual(2, len(r["results"]))
        self.assertDictEqual(
            r["results"][0],
            {"user_id": BOB, "display_name": "bob", "avatar_url": None},
        )
        self.assertDictEqual(
            r["results"][1],
            {"user_id": BOBBY, "display_name": "bobby", "avatar_url": None},
        )

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_limit_correct(self) -> None:
        """Two users match "bob"; with a limit of 1 the result is truncated and
        flagged as limited.
        """
        r = self.get_success(self.store.search_user_dir(ALICE, "bob", 1))
        self.assertTrue(r["limited"])
        self.assertEqual(1, len(r["results"]))

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_stop_words(self) -> None:
        """Tests that a user can look up another user by searching for the start of its
        display name even if that name happens to be a common English word that would
        usually be ignored in full text searches.
        """
        r = self.get_success(self.store.search_user_dir(ALICE, "be", 10))
        self.assertFalse(r["limited"])
        self.assertEqual(1, len(r["results"]))
        self.assertDictEqual(
            r["results"][0],
            {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
        )

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_start_of_user_id(self) -> None:
        """Tests that a user can look up another user by searching for the start
        of their user ID.
        """
        r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10))
        self.assertFalse(r["limited"])
        self.assertEqual(1, len(r["results"]))
        self.assertDictEqual(
            r["results"][0],
            {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
        )

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_ascii_case_insensitivity(self) -> None:
        """Tests that a user can look up another user by searching for their name in a
        different case.
        """
        CHARLIE = "@someuser:example.org"
        self.get_success(
            self.store.update_profile_in_user_dir(CHARLIE, "Charlie", None)
        )

        r = self.get_success(self.store.search_user_dir(ALICE, "cHARLIE", 10))
        self.assertFalse(r["limited"])
        self.assertEqual(1, len(r["results"]))
        self.assertDictEqual(
            r["results"][0],
            {"user_id": CHARLIE, "display_name": "Charlie", "avatar_url": None},
        )

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_unicode_case_insensitivity(self) -> None:
        """Tests that a user can look up another user by searching for their name in a
        different case, when the name is made of non-ASCII letters.
        """
        IVAN = "@someuser:example.org"
        self.get_success(self.store.update_profile_in_user_dir(IVAN, "Иван", None))

        r = self.get_success(self.store.search_user_dir(ALICE, "иВАН", 10))
        self.assertFalse(r["limited"])
        self.assertEqual(1, len(r["results"]))
        self.assertDictEqual(
            r["results"][0],
            {"user_id": IVAN, "display_name": "Иван", "avatar_url": None},
        )

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None:
        """Tests that a user can look up another user by searching for their name in a
        different case, when their name contains dotted or dotless "i"s.

        Some languages have dotted and dotless versions of "i", which are considered to
        be different letters: i <-> İ, ı <-> I. To make things difficult, they reuse the
        ASCII "i" and "I" code points, despite having different lowercase / uppercase
        forms.
        """
        USER = "@someuser:example.org"

        expected_matches = [
            # (search_term, display_name)
            # A search for "i" should match "İ".
            ("iiiii", "İİİİİ"),
            # A search for "I" should match "ı".
            ("IIIII", "ııııı"),
            # A search for "ı" should match "I".
            ("ııııı", "IIIII"),
            # A search for "İ" should match "i".
            ("İİİİİ", "iiiii"),
        ]

        for search_term, display_name in expected_matches:
            self.get_success(
                self.store.update_profile_in_user_dir(USER, display_name, None)
            )

            r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
            self.assertFalse(r["limited"])
            self.assertEqual(
                1,
                len(r["results"]),
                f"searching for {search_term!r} did not match {display_name!r}",
            )
            self.assertDictEqual(
                r["results"][0],
                {"user_id": USER, "display_name": display_name, "avatar_url": None},
            )

        # We don't test for negative matches, to allow implementations that consider all
        # the i variants to be the same.

    test_search_user_dir_dotted_dotless_i_case_insensitivity.skip = "not supported"  # type: ignore

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_unicode_normalization(self) -> None:
        """Tests that a user can look up another user by searching for their name with
        either composed or decomposed accents.
        """
        AMELIE = "@someuser:example.org"

        # Both forms are spelled with escapes so that the difference is visible:
        # "e\u0301" is "e" followed by a combining acute accent (decomposed),
        # "\u00e9" is the single precomposed code point.
        expected_matches = [
            # (search_term, display_name)
            ("Ame\u0301lie", "Am\u00e9lie"),
            ("Am\u00e9lie", "Ame\u0301lie"),
        ]

        for search_term, display_name in expected_matches:
            self.get_success(
                self.store.update_profile_in_user_dir(AMELIE, display_name, None)
            )

            r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
            self.assertFalse(r["limited"])
            self.assertEqual(
                1,
                len(r["results"]),
                f"searching for {search_term!r} did not match {display_name!r}",
            )
            self.assertDictEqual(
                r["results"][0],
                {"user_id": AMELIE, "display_name": display_name, "avatar_url": None},
            )

    @override_config({"user_directory": {"search_all_users": True}})
    def test_search_user_dir_accent_insensitivity(self) -> None:
        """Tests that a user can look up another user by searching for their name
        without any accents.
        """
        AMELIE = "@someuser:example.org"
        self.get_success(self.store.update_profile_in_user_dir(AMELIE, "Amélie", None))

        r = self.get_success(self.store.search_user_dir(ALICE, "amelie", 10))
        self.assertFalse(r["limited"])
        self.assertEqual(1, len(r["results"]))
        self.assertDictEqual(
            r["results"][0],
            {"user_id": AMELIE, "display_name": "Amélie", "avatar_url": None},
        )

        # It may be desirable for "é"s in search terms to not match plain "e"s and we
        # really don't want "é"s in search terms to match "e"s with different accents.
        # But we don't test for this to allow implementations that consider all
        # "e"-lookalikes to be the same.

    test_search_user_dir_accent_insensitivity.skip = "not supported yet"  # type: ignore
class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
    """Repeat the inherited search tests with `use_icu` switched on."""

    use_icu = True

    # `icu` is None when the optional PyICU import failed; in that case the
    # whole class is marked as skipped.
    if icu is None:
        skip = "Requires PyICU"
class UserDirectoryICUTestCase(HomeserverTestCase):
    """Tests for how display names and search terms are split into words."""

    # NOTE(review): this skips every test in the class when PyICU is missing,
    # including test_regex_word_boundary_punctuation, which only exercises the
    # regex tokeniser. Confirm whether that is intended.
    if not icu:
        skip = "Requires PyICU"

    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
        """Keep handles on the main datastore and the table-inspection helper."""
        self.store = hs.get_datastores().main
        self.user_dir_helper = GetUserDirectoryTables(self.store)

    def test_icu_word_boundary(self) -> None:
        """Tests that we correctly detect word boundaries when ICU (International
        Components for Unicode) support is available.
        """
        # The accent is deliberately the decomposed form: a lowercase "a"
        # followed by a U+0301 combining acute accent. It is written as an
        # escape so that the two code points cannot be silently merged into a
        # precomposed character by an editor or a copy/paste.
        display_name = "Ga\u0301o"

        # This word is not broken down correctly by Python's regular expressions,
        # likely because of the combining accent described above: the regex
        # yields two matches for what is a single word. This is specifically
        # something that ICU support fixes.
        matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
        self.assertEqual(len(matches), 2)

        self.get_success(
            self.store.update_profile_in_user_dir(ALICE, display_name, None)
        )
        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))

        # Check that searching for this user yields the correct result.
        r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
        self.assertFalse(r["limited"])
        self.assertEqual(len(r["results"]), 1)
        self.assertDictEqual(
            r["results"][0],
            {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
        )

    def test_icu_word_boundary_punctuation(self) -> None:
        """
        Tests the behaviour of punctuation with the ICU tokeniser.

        Seems to depend on underlying version of ICU.
        """
        # Note: either tokenisation is fine, because Postgres actually splits
        # words itself afterwards.
        self.assertIn(
            _parse_words_with_icu("lazy'fox jumped:over the.dog"),
            (
                # ICU 66 on Ubuntu 20.04
                ["lazy'fox", "jumped", "over", "the", "dog"],
                # ICU 70 on Ubuntu 22.04
                ["lazy'fox", "jumped:over", "the.dog"],
                # pyicu 2.10.2 on Alpine edge / macOS
                ["lazy'fox", "jumped", "over", "the.dog"],
            ),
        )

    def test_regex_word_boundary_punctuation(self) -> None:
        """
        Tests the behaviour of punctuation with the non-ICU tokeniser
        """
        self.assertEqual(
            _parse_words_with_regex("lazy'fox jumped:over the.dog"),
            ["lazy", "fox", "jumped", "over", "the", "dog"],
        )
|