Browse Source

Merge pull request #224 from matrix-org/anoa/lookup_perf

Improve performance of v2 lookup with temporary table
David Baker 4 years ago
parent
commit
ab6c25548c
3 changed files with 48 additions and 19 deletions
  1. 1 0
      CHANGELOG.md
  2. 46 14
      sydent/db/threepid_associations.py
  3. 1 5
      sydent/http/servlets/lookupv2servlet.py

+ 1 - 0
CHANGELOG.md

@@ -14,6 +14,7 @@ Unreleased changes
  * Fix logging so Sydent doesn't log 3PIDs when processing lookup requests
  * Fix incorrect HTTP response from `/3pid/getValidated3pid` endpoint on
    failure. [#216](https://github.com/matrix-org/sydent/pull/216)
+ * Improve performance of hashed lookups
 
 
 Changes in [1.0.3](https://github.com/matrix-org/sydent/releases/tag/v1.0.3) (2019-05-03)

+ 46 - 14
sydent/db/threepid_associations.py

@@ -227,22 +227,54 @@ class GlobalAssociationStore:
         )
         self.sydent.db.commit()
 
-    def retrieveMxidFromHash(self, lookup_hash):
-        """Returns an mxid from a given lookup_hash value
+    def retrieveMxidsForHashes(self, addresses):
+        """Returns a mapping of lookup_hash to mxid for a list of given lookup_hash values
 
-        :param input_hash: The lookup_hash value to lookup in the database
-        :type input_hash: str
+        :param addresses: An array of lookup_hash values to check against the db
+        :type addresses: list[str]
 
-        :returns the mxid relating to the lookup_hash value if found,
-                 otherwise None
-        :rtype: str|None
+        :returns a dictionary of lookup_hash values to mxids of all discovered matches
+        :rtype: dict[str, str]
         """
         cur = self.sydent.db.cursor()
 
-        res = cur.execute(
-            "SELECT mxid FROM global_threepid_associations WHERE lookup_hash = ?", (lookup_hash,)
-        )
-        row = res.fetchone()
-        if not row:
-            return None
-        return row[0]
+        cur.execute("CREATE TEMPORARY TABLE tmp_retrieve_mxids_for_hashes "
+                    "(lookup_hash VARCHAR)")
+        cur.execute("CREATE INDEX tmp_retrieve_mxids_for_hashes_lookup_hash ON "
+                    "tmp_retrieve_mxids_for_hashes(lookup_hash)")
+
+        results = {}
+        try:
+            # Convert list of addresses to list of tuples of addresses
+            addresses = [(x,) for x in addresses]
+
+            inserted_cap = 0
+            while inserted_cap < len(addresses):
+                cur.executemany(
+                    "INSERT INTO tmp_retrieve_mxids_for_hashes(lookup_hash) "
+                    "VALUES (?)",
+                    addresses[inserted_cap:inserted_cap + 500]
+                )
+                inserted_cap += 500
+
+            res = cur.execute(
+                # 'notBefore' is the time the association starts being valid, 'notAfter' the time at which
+                # it ceases to be valid, so the current time must be greater than 'notBefore' and less than 'notAfter'.
+                "SELECT gta.lookup_hash, gta.mxid FROM global_threepid_associations gta "
+                "JOIN tmp_retrieve_mxids_for_hashes "
+                "ON gta.lookup_hash = tmp_retrieve_mxids_for_hashes.lookup_hash "
+                "WHERE gta.notBefore < ? AND gta.notAfter > ? "
+                "ORDER BY gta.lookup_hash, gta.mxid, gta.ts",
+                (time_msec(), time_msec())
+            )
+
+            # Place the results from the query into a dictionary
+            # Rows are ordered by lookup_hash, then mxid, then ts, and later rows overwrite earlier
+            # ones, so if there are multiple mxids for the same lookup hash, only the last row in
+            # that order is returned
+            for lookup_hash, mxid in res.fetchall():
+                results[lookup_hash] = mxid
+
+        finally:
+            cur.execute("DROP TABLE tmp_retrieve_mxids_for_hashes")
+
+        return results

+ 1 - 5
sydent/http/servlets/lookupv2servlet.py

@@ -121,11 +121,7 @@ class LookupV2Servlet(Resource):
 
         elif algorithm == "sha256":
             # Lookup using SHA256 with URL-safe base64 encoding
-            mappings = {}
-            for h in addresses:
-                mxid = self.globalAssociationStore.retrieveMxidFromHash(h)
-                if mxid:
-                    mappings[h] = mxid
+            mappings = self.globalAssociationStore.retrieveMxidsForHashes(addresses)
 
             return {'mappings': mappings}