SiteStorage.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. import os
  2. import re
  3. import shutil
  4. import json
  5. import time
  6. import sys
  7. import sqlite3
  8. import gevent.event
  9. from Db import Db
  10. from Debug import Debug
  11. from Config import config
  12. from util import helper
  13. from Plugin import PluginManager
  14. from Translate import translate as _
  15. @PluginManager.acceptPlugins
  16. class SiteStorage(object):
  def __init__(self, site, allow_create=True):
      """Bind the storage to a site and make sure its data directory exists.

      site: site object; its `address` names the directory below
          config.data_dir and its `log` is reused as this object's logger.
      allow_create: when true a missing data directory is created,
          otherwise an Exception is raised for it.
      """
      self.site = site
      self.log = site.log
      self.directory = u"%s/%s" % (config.data_dir, self.site.address)  # Site data directory
      self.allowed_dir = os.path.abspath(self.directory)  # Only serve files within this dir
      self.db = None  # Db class (None until opened)
      self.db_checked = False  # Checked db tables since startup
      self.event_db_busy = None  # Gevent AsyncResult while the db is working on a rebuild
      self.has_db = self.isFile("dbschema.json")  # The site has schema

      if os.path.isdir(self.directory):
          return
      if not allow_create:
          raise Exception("Directory not exists: %s" % self.directory)
      os.mkdir(self.directory)  # Create directory if not found
  31. # Load db from dbschema.json
  def openDb(self, check=True):
      """Open the site database described by dbschema.json.

      check: when true, a missing or zero-sized db file triggers a rebuild
          before opening, and (unless self.db_checked is set) the tables
          are checked after opening, with another rebuild if any changed.

      Raises Exception when dbschema.json cannot be loaded or the db path
      cannot be derived from it.
      """
      try:
          schema = self.loadJson("dbschema.json")
          db_path = self.getPath(schema["db_file"])
      except Exception as err:
          raise Exception("dbschema.json is not a valid JSON: %s" % err)

      # Db file not exist or null
      if check and (not os.path.isfile(db_path) or os.path.getsize(db_path) == 0):
          self.rebuildDb()

      self.db = self.db or Db(schema, db_path)

      if not check or self.db_checked:
          return
      if self.db.checkTables():
          self.rebuildDb(delete_db=False)  # TODO: only update the changed table datas
  47. def closeDb(self):
  48. if self.db:
  49. self.db.close()
  50. self.event_db_busy = None
  51. self.db = None
  52. # Return db class
  53. def getDb(self):
  54. if not self.db:
  55. self.log.debug("No database, waiting for dbschema.json...")
  56. self.site.needFile("dbschema.json", priority=3)
  57. self.has_db = self.isFile("dbschema.json") # Recheck if dbschema exist
  58. if self.has_db:
  59. self.openDb()
  60. return self.db
  61. def updateDbFile(self, inner_path, file=None, cur=None):
  62. path = self.getPath(inner_path)
  63. return self.getDb().updateJson(path, file, cur)
  64. # Return possible db files for the site
  def getDbFiles(self):
      """Yield (inner_path, absolute_path) for every file that may belong in the db.

      Goes through all loaded content.json entries and yields the
      content.json itself plus each listed regular or optional file whose
      name ends in ".json" or ".json.gz".  Listed files that are missing
      on disk are logged and skipped.
      """
      json_suffixes = (".json", ".json.gz")  # Same extensions onUpdated() loads into the db
      # Iterate over a snapshot (as verifyFiles does): this generator hands
      # control back to its caller between items
      for content_inner_path, content in list(self.site.content_manager.contents.items()):
          # content.json file itself
          if self.isFile(content_inner_path):
              yield content_inner_path, self.getPath(content_inner_path)
          else:
              self.log.error("[MISSING] %s" % content_inner_path)

          # Data files in content.json
          content_inner_path_dir = helper.getDirname(content_inner_path)  # Content.json dir relative to site
          listed_files = list(content.get("files", {}).keys()) + list(content.get("files_optional", {}).keys())
          for file_relative_path in listed_files:
              if not file_relative_path.endswith(json_suffixes):
                  continue  # We are only interested in json files
              file_inner_path = content_inner_path_dir + file_relative_path  # File relative to site dir
              file_inner_path = file_inner_path.strip("/")  # Strip leading /
              if self.isFile(file_inner_path):
                  yield file_inner_path, self.getPath(file_inner_path)
              else:
                  self.log.error("[MISSING] %s" % file_inner_path)
  83. # Rebuild sql cache
  def rebuildDb(self, delete_db=True):
      """Recreate the SQL cache from the site's json files.

      delete_db: delete an existing db file before rebuilding.

      Returns False when the site has no dbschema.json, otherwise None.
      While the rebuild runs self.event_db_busy holds a gevent AsyncResult
      that query() waits on; it is released even if the rebuild fails.
      """
      self.has_db = self.isFile("dbschema.json")
      if not self.has_db:
          return False

      busy_event = gevent.event.AsyncResult()
      self.event_db_busy = busy_event
      try:
          schema = self.loadJson("dbschema.json")
          db_path = self.getPath(schema["db_file"])
          if os.path.isfile(db_path) and delete_db:
              if self.db:
                  self.db.close()  # Close db if open
                  time.sleep(0.5)
              self.log.info("Deleting %s" % db_path)
              try:
                  os.unlink(db_path)
              except Exception as err:
                  self.log.error("Delete error: %s" % err)
          self.db = None
          self.openDb(check=False)
          self.log.info("Creating tables...")
          self.db.checkTables()
          self.log.info("Importing data...")
          cur = self.db.getCursor()
          cur.execute("BEGIN")
          cur.logging = False
          found = 0
          s = time.time()
          db_files = list(self.getDbFiles())

          def notify(imported, percent):
              # Push the rebuild progress to the site's websocket clients
              self.site.messageWebsocket(
                  _["Database rebuilding...<br>Imported {0} of {1} files..."].format(imported, len(db_files)),
                  "rebuild",
                  percent
              )

          try:
              if len(db_files) > 100:
                  notify("0000", 0)
              for file_inner_path, file_path in db_files:
                  try:
                      # Close every data file right after its import instead of leaking the handle
                      with open(file_path, "rb") as data_file:
                          if self.updateDbFile(file_inner_path, file=data_file, cur=cur):
                              found += 1
                  except Exception as err:
                      self.log.error("Error importing %s: %s" % (file_inner_path, Debug.formatException(err)))
                  if found and found % 100 == 0:
                      notify(found, int(float(found) / len(db_files) * 100))
          finally:
              cur.execute("END")
              if len(db_files) > 100:
                  notify(found, 100)
              self.log.info("Imported %s data file in %ss" % (found, time.time() - s))
      finally:
          # Always release waiters, also when opening the db or creating the tables failed
          busy_event.set(True)  # Event done, notify waiters
          self.event_db_busy = None  # Clear event
  133. # Execute sql query or rebuild on dberror
  134. def query(self, query, params=None):
  135. if self.event_db_busy: # Db not ready for queries
  136. self.log.debug("Wating for db...")
  137. self.event_db_busy.get() # Wait for event
  138. try:
  139. res = self.getDb().execute(query, params)
  140. except sqlite3.DatabaseError, err:
  141. if err.__class__.__name__ == "DatabaseError":
  142. self.log.error("Database error: %s, query: %s, try to rebuilding it..." % (err, query))
  143. self.rebuildDb()
  144. res = self.db.cur.execute(query, params)
  145. else:
  146. raise err
  147. return res
  148. # Open file object
  149. def open(self, inner_path, mode="rb", create_dirs=False):
  150. file_path = self.getPath(inner_path)
  151. if create_dirs:
  152. file_dir = os.path.dirname(file_path)
  153. if not os.path.isdir(file_dir):
  154. os.makedirs(file_dir)
  155. return open(file_path, mode)
  156. # Read file content
  157. def read(self, inner_path, mode="r"):
  158. return open(self.getPath(inner_path), mode).read()
  159. # Write content to file
  160. def write(self, inner_path, content):
  161. file_path = self.getPath(inner_path)
  162. # Create dir if not exist
  163. file_dir = os.path.dirname(file_path)
  164. if not os.path.isdir(file_dir):
  165. os.makedirs(file_dir)
  166. # Write file
  167. if hasattr(content, 'read'): # File-like object
  168. with open(file_path, "wb") as file:
  169. shutil.copyfileobj(content, file) # Write buff to disk
  170. else: # Simple string
  171. if inner_path == "content.json" and os.path.isfile(file_path):
  172. helper.atomicWrite(file_path, content)
  173. else:
  174. with open(file_path, "wb") as file:
  175. file.write(content)
  176. del content
  177. self.onUpdated(inner_path)
  178. # Remove file from filesystem
  179. def delete(self, inner_path):
  180. file_path = self.getPath(inner_path)
  181. os.unlink(file_path)
  182. self.onUpdated(inner_path, file=False)
  183. def deleteDir(self, inner_path):
  184. dir_path = self.getPath(inner_path)
  185. os.rmdir(dir_path)
  186. def rename(self, inner_path_before, inner_path_after):
  187. for retry in range(3):
  188. # To workaround "The process cannot access the file beacause it is being used by another process." error
  189. try:
  190. os.rename(self.getPath(inner_path_before), self.getPath(inner_path_after))
  191. err = None
  192. break
  193. except Exception, err:
  194. self.log.error("%s rename error: %s (retry #%s)" % (inner_path_before, err, retry))
  195. time.sleep(0.1 + retry)
  196. if err:
  197. raise err
  198. # List files from a directory
  199. def walk(self, dir_inner_path):
  200. directory = self.getPath(dir_inner_path)
  201. for root, dirs, files in os.walk(directory):
  202. root = root.replace("\\", "/")
  203. root_relative_path = re.sub("^%s" % re.escape(directory), "", root).lstrip("/")
  204. for file_name in files:
  205. if root_relative_path: # Not root dir
  206. yield root_relative_path + "/" + file_name
  207. else:
  208. yield file_name
  209. # List the entries (files and directories) of a directory
  210. def list(self, dir_inner_path):
  211. directory = self.getPath(dir_inner_path)
  212. return os.listdir(directory)
  213. # Site content updated
  214. def onUpdated(self, inner_path, file=None):
  215. # Update Sql cache
  216. if inner_path == "dbschema.json":
  217. self.has_db = self.isFile("dbschema.json")
  218. # Reopen DB to check changes
  219. if self.has_db:
  220. self.closeDb()
  221. self.openDb()
  222. elif not config.disable_db and (inner_path.endswith(".json") or inner_path.endswith(".json.gz")) and self.has_db: # Load json file to db
  223. if config.verbose:
  224. self.log.debug("Loading json file to db: %s (file: %s)" % (inner_path, file))
  225. try:
  226. self.updateDbFile(inner_path, file)
  227. except Exception, err:
  228. self.log.error("Json %s load error: %s" % (inner_path, Debug.formatException(err)))
  229. self.closeDb()
  230. # Load and parse json file
  231. def loadJson(self, inner_path):
  232. with self.open(inner_path) as file:
  233. return json.load(file)
  234. # Write formatted json file
  235. def writeJson(self, inner_path, data):
  236. content = json.dumps(data, indent=1, sort_keys=True)
  237. # Make it a little more compact by removing unnecessary white space
  238. def compact_dict(match):
  239. if "\n" in match.group(0):
  240. return match.group(0).replace(match.group(1), match.group(1).strip())
  241. else:
  242. return match.group(0)
  243. content = re.sub("\{(\n[^,\[\{]{10,100}?)\}[, ]{0,2}\n", compact_dict, content, flags=re.DOTALL)
  244. def compact_list(match):
  245. if "\n" in match.group(0):
  246. stripped_lines = re.sub("\n[ ]*", "", match.group(1))
  247. return match.group(0).replace(match.group(1), stripped_lines)
  248. else:
  249. return match.group(0)
  250. content = re.sub("\[([^\[\{]{2,300}?)\][, ]{0,2}\n", compact_list, content, flags=re.DOTALL)
  251. # Remove end of line whitespace
  252. content = re.sub("(?m)[ ]+$", "", content)
  253. # Write to disk
  254. self.write(inner_path, content)
  255. # Get file size
  256. def getSize(self, inner_path):
  257. path = self.getPath(inner_path)
  258. try:
  259. return os.path.getsize(path)
  260. except:
  261. return 0
  262. # File exist
  263. def isFile(self, inner_path):
  264. return os.path.isfile(self.getPath(inner_path))
  265. # File or directory exist
  266. def isExists(self, inner_path):
  267. return os.path.exists(self.getPath(inner_path))
  268. # Dir exist
  269. def isDir(self, inner_path):
  270. return os.path.isdir(self.getPath(inner_path))
  271. # Security check and return path of site's file
  272. def getPath(self, inner_path):
  273. inner_path = inner_path.replace("\\", "/") # Windows separator fix
  274. if not inner_path:
  275. return self.directory
  276. if ".." in inner_path:
  277. raise Exception(u"File not allowed: %s" % inner_path)
  278. return u"%s/%s" % (self.directory, inner_path)
  279. # Get site dir relative path
  280. def getInnerPath(self, path):
  281. if path == self.directory:
  282. inner_path = ""
  283. else:
  284. if path.startswith(self.directory):
  285. inner_path = path[len(self.directory)+1:]
  286. else:
  287. raise Exception(u"File not allowed: %s" % path)
  288. return inner_path
  289. # Verify all files sha512sum using content.json
  def verifyFiles(self, quick_check=False, add_optional=False, add_changed=True):
      """Check the site's files against the loaded content.json files.

      quick_check: only compare file sizes instead of calling
          content_manager.verifyFile() on the file content.
      add_optional: also report optional files that are missing on disk.
      add_changed: report regular files that fail the check; when false
          only those listed by a content with "cert_user_id" are reported.

      Returns the list of inner paths considered bad.  Missing content.json
      and regular files, and optional files failing the check, are always
      included.  The content manager's optional-file bookkeeping
      (optionalDownloaded / optionalRemove) is updated along the way.
      """
      content_manager = self.site.content_manager
      bad_files = []

      def verify_content(file_inner_path, file_path):
          # Return (ok, error); the file is closed as soon as it has been checked
          try:
              with open(file_path, "rb") as checked_file:
                  return content_manager.verifyFile(file_inner_path, checked_file), None
          except Exception as err:
              return False, err

      if not content_manager.contents.get("content.json"):  # No content.json, download it first
          self.log.debug("VerifyFile content.json not exists")
          self.site.needFile("content.json", update=True)  # Force update to fix corrupt file
          content_manager.loadContent()  # Reload content.json

      for i, (content_inner_path, content) in enumerate(list(content_manager.contents.items()), 1):
          if i % 50 == 0:
              time.sleep(0.0001)  # Context switch to avoid gevent hangs

          if not os.path.isfile(self.getPath(content_inner_path)):  # Missing content.json file
              self.log.debug("[MISSING] %s" % content_inner_path)
              bad_files.append(content_inner_path)

          for file_relative_path in content.get("files", {}).keys():
              file_inner_path = helper.getDirname(content_inner_path) + file_relative_path  # Relative to site dir
              file_inner_path = file_inner_path.strip("/")  # Strip leading /
              file_path = self.getPath(file_inner_path)
              if not os.path.isfile(file_path):
                  self.log.debug("[MISSING] %s" % file_inner_path)
                  bad_files.append(file_inner_path)
                  continue

              if quick_check:
                  ok = os.path.getsize(file_path) == content["files"][file_relative_path]["size"]
                  err = None if ok else "Invalid size"
              else:
                  ok, err = verify_content(file_inner_path, file_path)
                  if not ok and err is None:
                      # verifyFile() rejected the file without raising: there is no exception to report
                      err = "Verify failed"

              if not ok:
                  self.log.debug("[INVALID] %s: %s" % (file_inner_path, err))
                  if add_changed or content.get("cert_user_id"):  # If updating own site only add changed user files
                      bad_files.append(file_inner_path)

          # Optional files
          optional_added = 0
          optional_removed = 0
          for file_relative_path, file_node in content.get("files_optional", {}).items():
              file_inner_path = helper.getDirname(content_inner_path) + file_relative_path  # Relative to site dir
              file_inner_path = file_inner_path.strip("/")  # Strip leading /
              file_path = self.getPath(file_inner_path)
              if not os.path.isfile(file_path):
                  if content_manager.hashfield.hasHash(file_node["sha512"]):
                      content_manager.optionalRemove(file_inner_path, file_node["sha512"], file_node["size"])
                  if add_optional:
                      bad_files.append(file_inner_path)
                  continue

              if quick_check:
                  ok = os.path.getsize(file_path) == file_node["size"]
              else:
                  ok = verify_content(file_inner_path, file_path)[0]

              if ok:
                  if not content_manager.hashfield.hasHash(file_node["sha512"]):
                      content_manager.optionalDownloaded(file_inner_path, file_node["sha512"], file_node["size"])
                      optional_added += 1
              else:
                  if content_manager.hashfield.hasHash(file_node["sha512"]):
                      content_manager.optionalRemove(file_inner_path, file_node["sha512"], file_node["size"])
                      optional_removed += 1
                  bad_files.append(file_inner_path)
                  self.log.debug("[OPTIONAL CHANGED] %s" % file_inner_path)

          if config.verbose:
              self.log.debug(
                  "%s verified: %s, quick: %s, optionals: +%s -%s" %
                  # .get(): a content without a "files" section must not break the summary
                  (content_inner_path, len(content.get("files", {})), quick_check, optional_added, optional_removed)
              )

      time.sleep(0.0001)  # Context switch to avoid gevent hangs
      return bad_files
  363. # Check and try to fix site files integrity
  364. def updateBadFiles(self, quick_check=True):
  365. s = time.time()
  366. bad_files = self.verifyFiles(
  367. quick_check,
  368. add_optional=self.site.isDownloadable(""),
  369. add_changed=not self.site.settings.get("own") # Don't overwrite changed files if site owned
  370. )
  371. self.site.bad_files = {}
  372. if bad_files:
  373. for bad_file in bad_files:
  374. self.site.bad_files[bad_file] = 1
  375. self.log.debug("Checked files in %.2fs... Found bad files: %s, Quick:%s" % (time.time() - s, len(bad_files), quick_check))
  376. # Delete all files of the site
  377. def deleteFiles(self):
  378. self.log.debug("Deleting files from content.json...")
  379. files = [] # Get filenames
  380. for content_inner_path in self.site.content_manager.contents.keys():
  381. content = self.site.content_manager.contents[content_inner_path]
  382. files.append(content_inner_path)
  383. # Add normal files
  384. for file_relative_path in content.get("files", {}).keys():
  385. file_inner_path = helper.getDirname(content_inner_path) + file_relative_path # Relative to site dir
  386. files.append(file_inner_path)
  387. # Add optional files
  388. for file_relative_path in content.get("files_optional", {}).keys():
  389. file_inner_path = helper.getDirname(content_inner_path) + file_relative_path # Relative to site dir
  390. files.append(file_inner_path)
  391. if self.isFile("dbschema.json"):
  392. self.log.debug("Deleting db file...")
  393. self.closeDb()
  394. self.has_db = False
  395. try:
  396. schema = self.loadJson("dbschema.json")
  397. db_path = self.getPath(schema["db_file"])
  398. if os.path.isfile(db_path):
  399. os.unlink(db_path)
  400. except Exception, err:
  401. self.log.error("Db file delete error: %s" % err)
  402. for inner_path in files:
  403. path = self.getPath(inner_path)
  404. if os.path.isfile(path):
  405. for retry in range(5):
  406. try:
  407. os.unlink(path)
  408. break
  409. except Exception, err:
  410. self.log.error("Error removing %s: %s, try #%s" % (path, err, retry))
  411. time.sleep(float(retry) / 10)
  412. self.onUpdated(inner_path, False)
  413. self.log.debug("Deleting empty dirs...")
  414. for root, dirs, files in os.walk(self.directory, topdown=False):
  415. for dir in dirs:
  416. path = os.path.join(root, dir)
  417. if os.path.isdir(path) and os.listdir(path) == []:
  418. os.removedirs(path)
  419. self.log.debug("Removing %s" % path)
  420. if os.path.isdir(self.directory) and os.listdir(self.directory) == []:
  421. os.removedirs(self.directory) # Remove sites directory if empty
  422. if os.path.isdir(self.directory):
  423. self.log.debug("Some unknown file remained in site data dir: %s..." % self.directory)
  424. return False # Some files not deleted
  425. else:
  426. self.log.debug("Site data directory deleted: %s..." % self.directory)
  427. return True # All clean