# loader.py
  1. #
  2. # Metrix++, Copyright 2009-2013, Metrix++ Project
  3. # Link: http://metrixplusplus.sourceforge.net
  4. #
  5. # This file is a part of Metrix++ Tool.
  6. #
  7. # Metrix++ is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, version 3 of the License.
  10. #
  11. # Metrix++ is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with Metrix++. If not, see <http://www.gnu.org/licenses/>.
  18. #
  19. import logging
  20. import os.path
  21. import core.api
  22. import core.db.sqlite
  23. ####################################
  24. # Data Interface
  25. ####################################
  26. from core.api import Data, FileRegionData, Marker, FileData
  27. class AggregatedData(Data):
  28. def __init__(self, loader, path):
  29. Data.__init__(self)
  30. self.path = path
  31. self.loader = loader
  32. self.subdirs = None
  33. self.subfiles = None
  34. def get_subdirs(self):
  35. if self.subdirs != None:
  36. return self.subdirs
  37. self.subdirs = []
  38. if self.path != None:
  39. for subdir in self.loader.db.iterate_dircontent(self.path, include_subdirs = True, include_subfiles = False):
  40. self.subdirs.append(subdir)
  41. return self.subdirs
  42. def get_subfiles(self):
  43. if self.subfiles != None:
  44. return self.subfiles
  45. self.subfiles = []
  46. if self.path != None:
  47. for subfile in self.loader.db.iterate_dircontent(self.path, include_subdirs = False, include_subfiles = True):
  48. self.subfiles.append(subfile)
  49. return self.subfiles
  50. class SelectData(Data):
  51. def __init__(self, loader, path, file_id, region_id):
  52. Data.__init__(self)
  53. self.loader = loader
  54. self.path = path
  55. self.file_id = file_id
  56. self.region_id = region_id
  57. self.region = None
  58. def get_path(self):
  59. return self.path
  60. def get_region(self):
  61. if self.region == None and self.region_id != None:
  62. row = self.loader.db.get_region(self.file_id, self.region_id)
  63. if row != None:
  64. self.region = FileRegionData(self.loader,
  65. self.file_id,
  66. self.region_id,
  67. row.name,
  68. row.begin,
  69. row.end,
  70. row.line_begin,
  71. row.line_end,
  72. row.cursor,
  73. row.group,
  74. row.checksum)
  75. return self.region
  76. class DiffData(Data):
  77. def __init__(self, new_data, old_data):
  78. Data.__init__(self)
  79. self.new_data = new_data
  80. self.old_data = old_data
  81. def get_data(self, namespace, field):
  82. new_data = self.new_data.get_data(namespace, field)
  83. old_data = self.old_data.get_data(namespace, field)
  84. if new_data == None:
  85. return None
  86. if old_data == None:
  87. # non_zero fields has got zero value by default if missed
  88. # the data can be also unavailable,
  89. # because previous collection does not include that
  90. # but external tools (like limit.py) should warn about this,
  91. # using list of registered database properties
  92. old_data = 0
  93. return new_data - old_data
  94. ####################################
  95. # Packager Interface
  96. ####################################
  97. class PackagerError(Exception):
  98. def __init__(self):
  99. Exception.__init__(self, "Failed to pack or unpack.")
  100. class PackagerFactory(object):
  101. def create(self, python_type, non_zero):
  102. if python_type == None:
  103. return PackagerFactory.SkipPackager()
  104. if python_type == int:
  105. if non_zero == False:
  106. return PackagerFactory.IntPackager()
  107. else:
  108. return PackagerFactory.IntNonZeroPackager()
  109. if python_type == float and non_zero == False:
  110. return PackagerFactory.FloatPackager()
  111. if python_type == str:
  112. return PackagerFactory.StringPackager()
  113. class PackagerFactoryError(Exception):
  114. def __init__(self, python_type):
  115. Exception.__init__(self, "Python type '" + str(python_type) + "' is not supported by the factory.")
  116. raise PackagerFactoryError(python_type)
  117. def get_python_type(self, sql_type):
  118. if sql_type == "integer":
  119. return int
  120. if sql_type == "real":
  121. return float
  122. if sql_type == "text":
  123. return str
  124. class PackagerFactoryError(Exception):
  125. def __init__(self, sql_type):
  126. Exception.__init__(self, "SQL type '" + str(sql_type) + "' is not supported by the factory.")
  127. raise PackagerFactoryError(sql_type)
  128. class IPackager(object):
  129. def pack(self, unpacked_data):
  130. raise core.api.InterfaceNotImplemented(self)
  131. def unpack(self, packed_data):
  132. raise core.api.InterfaceNotImplemented(self)
  133. def get_sql_type(self):
  134. raise core.api.InterfaceNotImplemented(self)
  135. def get_python_type(self):
  136. raise core.api.InterfaceNotImplemented(self)
  137. def is_non_zero(self):
  138. return False
  139. class IntPackager(IPackager):
  140. def pack(self, unpacked_data):
  141. if not isinstance(unpacked_data, int):
  142. raise PackagerError()
  143. return str(unpacked_data)
  144. def unpack(self, packed_data):
  145. try:
  146. return int(packed_data)
  147. except ValueError:
  148. raise PackagerError()
  149. def get_sql_type(self):
  150. return "integer"
  151. def get_python_type(self):
  152. return int
  153. class IntNonZeroPackager(IntPackager):
  154. def pack(self, unpacked_data):
  155. if unpacked_data == 0:
  156. raise PackagerError()
  157. return PackagerFactory.IntPackager.pack(self, unpacked_data)
  158. def is_non_zero(self):
  159. return True
  160. class FloatPackager(IPackager):
  161. def pack(self, unpacked_data):
  162. if not isinstance(unpacked_data, float):
  163. raise PackagerError()
  164. return str(unpacked_data)
  165. def unpack(self, packed_data):
  166. try:
  167. return float(packed_data)
  168. except ValueError:
  169. raise PackagerError()
  170. def get_sql_type(self):
  171. return "real"
  172. def get_python_type(self):
  173. return float
  174. class FloatNonZeroPackager(FloatPackager):
  175. def pack(self, unpacked_data):
  176. if unpacked_data == 0:
  177. raise PackagerError()
  178. return PackagerFactory.FloatPackager.pack(self, unpacked_data)
  179. def is_non_zero(self):
  180. return True
  181. class StringPackager(IPackager):
  182. def pack(self, unpacked_data):
  183. if not isinstance(unpacked_data, str):
  184. raise PackagerError()
  185. return str(unpacked_data)
  186. def unpack(self, packed_data):
  187. try:
  188. return str(packed_data)
  189. except ValueError:
  190. raise PackagerError()
  191. def get_sql_type(self):
  192. return "text"
  193. def get_python_type(self):
  194. return str
  195. class SkipPackager(IPackager):
  196. def pack(self, unpacked_data):
  197. return None
  198. def unpack(self, packed_data):
  199. return None
  200. def get_sql_type(self):
  201. return None
  202. def get_python_type(self):
  203. return None
  204. ####################################
  205. # Loader
  206. ####################################
  207. class NamespaceError(Exception):
  208. def __init__(self, namespace, reason):
  209. Exception.__init__(self, "Namespace '"
  210. + namespace
  211. + "': '"
  212. + reason
  213. + "'")
  214. class FieldError(Exception):
  215. def __init__(self, field, reason):
  216. Exception.__init__(self, "Field '"
  217. + field
  218. + "': '"
  219. + reason
  220. + "'")
  221. class Namespace(object):
  222. def __init__(self, db_handle, name, support_regions = False, version='1.0'):
  223. if not isinstance(name, str):
  224. raise NamespaceError(name, "name not a string")
  225. self.name = name
  226. self.support_regions = support_regions
  227. self.fields = {}
  228. self.db = db_handle
  229. if self.db.check_table(name) == False:
  230. self.db.create_table(name, support_regions, version)
  231. else:
  232. for column in self.db.iterate_columns(name):
  233. self.add_field(column.name, PackagerFactory().get_python_type(column.sql_type), non_zero=column.non_zero)
  234. def get_name(self):
  235. return self.name
  236. def are_regions_supported(self):
  237. return self.support_regions
  238. def add_field(self, field_name, python_type, non_zero=False):
  239. if not isinstance(field_name, str):
  240. raise FieldError(field_name, "field_name not a string")
  241. packager = PackagerFactory().create(python_type, non_zero)
  242. if field_name in self.fields.keys():
  243. raise FieldError(field_name, "double used")
  244. self.fields[field_name] = packager
  245. if self.db.check_column(self.get_name(), field_name) == False:
  246. # - False if cloned
  247. # - True if created
  248. return self.db.create_column(self.name, field_name, packager.get_sql_type(), non_zero=non_zero)
  249. return None # if double request
  250. def iterate_field_names(self):
  251. for name in self.fields.keys():
  252. yield name
  253. def get_field_packager(self, field_name):
  254. if field_name in self.fields.keys():
  255. return self.fields[field_name]
  256. else:
  257. return None
  258. def get_field_sql_type(self, field_name):
  259. return self.get_field_packager(field_name).get_sql_type()
  260. def get_field_python_type(self, field_name):
  261. return self.get_field_packager(field_name).get_python_type()
  262. class DataNotPackable(Exception):
  263. def __init__(self, namespace, field, value, packager, extra_message):
  264. Exception.__init__(self, "Data '"
  265. + str(value)
  266. + "' of type "
  267. + str(value.__class__)
  268. + " referred by '"
  269. + namespace
  270. + "=>"
  271. + field
  272. + "' is not packable by registered packager '"
  273. + str(packager.__class__)
  274. + "': " + extra_message)
  275. class Loader(object):
  276. def __init__(self):
  277. self.namespaces = {}
  278. self.db = None
  279. self.last_file_data = None # for performance boost reasons
  280. def create_database(self, dbfile, previous_db = None):
  281. self.db = core.db.sqlite.Database()
  282. if os.path.exists(dbfile):
  283. logging.warn("Removing existing file: " + dbfile)
  284. # TODO can reuse existing db file to speed up the processing?
  285. # TODO add option to choose to remove or to overwrite?
  286. os.unlink(dbfile)
  287. if previous_db != None and os.path.exists(previous_db) == False:
  288. raise core.api.ExitError(None, "Database file '" + previous_db + "' does not exist")
  289. self.db.create(dbfile, clone_from=previous_db)
  290. def open_database(self, dbfile, read_only = True):
  291. self.db = core.db.sqlite.Database()
  292. if os.path.exists(dbfile) == False:
  293. raise core.api.ExitError(None, "Database file '" + dbfile + "' does not exist")
  294. self.db.connect(dbfile, read_only=read_only)
  295. for table in self.db.iterate_tables():
  296. self.create_namespace(table.name, table.support_regions)
  297. def set_property(self, property_name, value):
  298. if self.db == None:
  299. return None
  300. return self.db.set_property(property_name, value)
  301. def get_property(self, property_name):
  302. if self.db == None:
  303. return None
  304. return self.db.get_property(property_name)
  305. def iterate_properties(self):
  306. if self.db == None:
  307. return None
  308. return self.db.iterate_properties()
  309. def create_namespace(self, name, support_regions = False, version='1.0'):
  310. if self.db == None:
  311. return None
  312. if name in self.namespaces.keys():
  313. raise NamespaceError(name, "double used")
  314. new_namespace = Namespace(self.db, name, support_regions, version)
  315. self.namespaces[name] = new_namespace
  316. return new_namespace
  317. def iterate_namespace_names(self):
  318. for name in self.namespaces.keys():
  319. yield name
  320. def get_namespace(self, name):
  321. if name in self.namespaces.keys():
  322. return self.namespaces[name]
  323. else:
  324. return None
  325. def create_file_data(self, path, checksum, content):
  326. if self.db == None:
  327. return None
  328. (new_id, is_updated) = self.db.create_file(path, checksum)
  329. result = FileData(self, path, new_id, checksum, content)
  330. self.last_file_data = result
  331. return (result, is_updated)
  332. def load_file_data(self, path):
  333. if self.db == None:
  334. return None
  335. if self.last_file_data != None and self.last_file_data.get_path() == path:
  336. return self.last_file_data
  337. data = self.db.get_file(path)
  338. if data == None:
  339. return None
  340. result = FileData(self, data.path, data.id, data.checksum, None)
  341. self.last_file_data = result
  342. return result
  343. def save_file_data(self, file_data):
  344. if self.db == None:
  345. return None
  346. class DataIterator(object):
  347. def iterate_packed_values(self, data, namespace, support_regions = False):
  348. for each in data.iterate_fields(namespace):
  349. space = self.loader.get_namespace(namespace)
  350. if space == None:
  351. raise DataNotPackable(namespace, each[0], each[1], None, "The namespace has not been found")
  352. packager = space.get_field_packager(each[0])
  353. if packager == None:
  354. raise DataNotPackable(namespace, each[0], each[1], None, "The field has not been found")
  355. if space.support_regions != support_regions:
  356. raise DataNotPackable(namespace, each[0], each[1], packager, "Incompatible support for regions")
  357. try:
  358. packed_data = packager.pack(each[1])
  359. if packed_data == None:
  360. continue
  361. except PackagerError:
  362. raise DataNotPackable(namespace, each[0], each[1], packager, "Packager raised exception")
  363. yield (each[0], packed_data)
  364. def __init__(self, loader, data, namespace, support_regions = False):
  365. self.loader = loader
  366. self.iterator = self.iterate_packed_values(data, namespace, support_regions)
  367. def __iter__(self):
  368. return self.iterator
  369. for namespace in file_data.iterate_namespaces():
  370. if file_data.is_namespace_updated(namespace) == False:
  371. continue
  372. self.db.add_row(namespace,
  373. file_data.get_id(),
  374. None,
  375. DataIterator(self, file_data, namespace))
  376. if file_data.are_regions_loaded():
  377. for region in file_data.iterate_regions():
  378. for namespace in region.iterate_namespaces():
  379. if region.is_namespace_updated(namespace) == False:
  380. continue
  381. self.db.add_row(namespace,
  382. file_data.get_id(),
  383. region.get_id(),
  384. DataIterator(self, region, namespace, support_regions = True))
  385. def iterate_file_data(self, path = None, path_like_filter = "%"):
  386. if self.db == None:
  387. return None
  388. final_path_like = path_like_filter
  389. if path != None:
  390. if self.db.check_dir(path) == False and self.db.check_file(path) == False:
  391. return None
  392. final_path_like = path + path_like_filter
  393. class FileDataIterator(object):
  394. def iterate_file_data(self, loader, final_path_like):
  395. for data in loader.db.iterate_files(path_like=final_path_like):
  396. yield FileData(loader, data.path, data.id, data.checksum, None)
  397. def __init__(self, loader, final_path_like):
  398. self.iterator = self.iterate_file_data(loader, final_path_like)
  399. def __iter__(self):
  400. return self.iterator
  401. if self.db == None:
  402. return None
  403. return FileDataIterator(self, final_path_like)
  404. def load_aggregated_data(self, path = None, path_like_filter = "%", namespaces = None):
  405. if self.db == None:
  406. return None
  407. final_path_like = path_like_filter
  408. if path != None:
  409. if self.db.check_dir(path) == False and self.db.check_file(path) == False:
  410. return None
  411. final_path_like = path + path_like_filter
  412. if namespaces == None:
  413. namespaces = self.namespaces.keys()
  414. result = AggregatedData(self, path)
  415. for name in namespaces:
  416. namespace = self.get_namespace(name)
  417. data = self.db.aggregate_rows(name, path_like = final_path_like)
  418. for field in data.keys():
  419. if namespace.get_field_packager(field).get_python_type() == str:
  420. continue
  421. if namespace.get_field_packager(field).is_non_zero() == True:
  422. data[field]['min'] = None
  423. data[field]['avg'] = None
  424. distribution = self.db.count_rows(name, path_like = final_path_like, group_by_column = field)
  425. data[field]['distribution-bars'] = []
  426. for each in distribution:
  427. if each[0] == None:
  428. continue
  429. assert(float(data[field]['count'] != 0))
  430. data[field]['distribution-bars'].append({'metric': each[0],
  431. 'count': each[1],
  432. 'ratio': round((float(each[1]) / float(data[field]['count'])), 4)})
  433. result.set_data(name, field, data[field])
  434. return result
  435. def load_selected_data(self, namespace, fields = None, path = None, path_like_filter = "%", filters = [],
  436. sort_by = None, limit_by = None):
  437. if self.db == None:
  438. return None
  439. final_path_like = path_like_filter
  440. if path != None:
  441. if self.db.check_dir(path) == False and self.db.check_file(path) == False:
  442. return None
  443. final_path_like = path + path_like_filter
  444. namespace_obj = self.get_namespace(namespace)
  445. if namespace_obj == None:
  446. return None
  447. class SelectDataIterator(object):
  448. def iterate_selected_values(self, loader, namespace_obj, final_path_like, fields, filters, sort_by, limit_by):
  449. for row in loader.db.select_rows(namespace_obj.get_name(), path_like=final_path_like, filters=filters,
  450. order_by=sort_by, limit_by=limit_by):
  451. region_id = None
  452. if namespace_obj.are_regions_supported() == True:
  453. region_id = row['region_id']
  454. data = SelectData(loader, row['path'], row['id'], region_id)
  455. field_names = fields
  456. if fields == None:
  457. field_names = namespace_obj.iterate_field_names()
  458. for field in field_names:
  459. data.set_data(namespace, field, row[field])
  460. yield data
  461. def __init__(self, loader, namespace_obj, final_path_like, fields, filters, sort_by, limit_by):
  462. self.iterator = self.iterate_selected_values(loader, namespace_obj, final_path_like, fields, filters, sort_by, limit_by)
  463. def __iter__(self):
  464. return self.iterator
  465. return SelectDataIterator(self, namespace_obj, final_path_like, fields, filters, sort_by, limit_by)