Package Gnumed :: Package pycommon :: Module gmMatchProvider
[frames] | no frames]

Source Code for Module Gnumed.pycommon.gmMatchProvider

  1  """Base classes for match providers. 
  2   
  3  They are used by business objects to give 
  4  phrasewheels the ability to guess phrases. 
  5   
  6  Copyright (C) GNUMed developers 
  7  license: GPL 
  8  """ 
  9  __version__ = "$Revision: 1.34 $" 
 10  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 11   
 12  # std lib 
 13  import string, types, time, sys, re as regex, logging 
 14   
 15   
 16  # GNUmed 
 17  from Gnumed.pycommon import gmPG2 
 18   
 19   
 20  _log = logging.getLogger('gm.ui') 
 21  _log.info(__version__) 
 22   
 23   
 24  default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"' 
 25  default_word_separators = '[- \t=+&:@]+' 
 26  #============================================================ 
class cMatchProvider(object):
    """Base class for match providing objects.

    Match sources might be:
    - database tables
    - flat files
    - previous input
    - config files
    - in-memory list created on the fly

    Subclasses implement getAllMatches(), getMatchesByPhrase(),
    getMatchesByWord() and getMatchesBySubstr(). getMatches()
    normalizes the fragment and dispatches to one of them.
    """
    print_queries = False

    # Class-level fallbacks so the thresholds always exist, even when
    # setThresholds() rejects its arguments on the very first call
    # (it reports the currently active values in its error message).
    __threshold_phrase = 1
    __threshold_word = 3
    __threshold_substring = 5
    #--------------------------------------------------------
    def __init__(self):
        self.setThresholds()

        self._context_vals = {}
        self.__ignored_chars = regex.compile(default_ignored_chars)
        self.__word_separators = regex.compile(default_word_separators)
    #--------------------------------------------------------
    # actions
    #--------------------------------------------------------
    def getMatches(self, aFragment = None):
        """Return matches according to aFragment and matching thresholds.

        The fragment is lower-cased, stripped of ignored characters and
        has its word separators collapsed to single spaces. The length
        of the result selects the matching strategy.

        Returns whatever the selected getMatchesBy*() / getAllMatches()
        method returns, or (False, []) if the normalized fragment is
        shorter than the phrase threshold.

        Raises ValueError if aFragment is None.

        FIXME: design decision: we dont worry about data source changes
               during the lifetime of a MatchProvider
        FIXME: append _("*get all items*") on truncation
        """
        # sanity check
        if aFragment is None:
            raise ValueError('Cannot find matches without a fragment.')

        # user explicitly wants all matches
        if aFragment == u'*':
            return self.getAllMatches()

        # case insensitivity
        tmpFragment = aFragment.lower()
        # remove ignored chars
        if self.__ignored_chars is not None:
            tmpFragment = self.__ignored_chars.sub('', tmpFragment)
        # normalize word separators
        if self.__word_separators is not None:
            tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
        # length in number of significant characters only
        lngFragment = len(tmpFragment)

        # order is important: test the largest threshold first
        if lngFragment >= self.__threshold_substring:
            return self.getMatchesBySubstr(tmpFragment)
        if lngFragment >= self.__threshold_word:
            return self.getMatchesByWord(tmpFragment)
        if lngFragment >= self.__threshold_phrase:
            return self.getMatchesByPhrase(tmpFragment)
        return (False, [])
    #--------------------------------------------------------
    def getAllMatches(self):
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        raise NotImplementedError
    #--------------------------------------------------------
    # configuration
    #--------------------------------------------------------
    def setThresholds(self, aPhrase = 1, aWord = 3, aSubstring = 5):
        """Set match location thresholds.

        - the fragment passed to getMatches() must contain at least this many
          characters before it triggers a match search at:
          1) phrase_start - start of phrase (first word)
          2) word_start - start of any word within phrase
          3) in_word - _inside_ any word within phrase

        Requires aPhrase <= aWord <= aSubstring. Returns True if the
        thresholds were applied, False (leaving the active thresholds
        untouched) otherwise.
        """
        # sanity checks
        if aSubstring < aWord:
            _log.error(
                'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aSubstring,
                aWord,
                self.__threshold_substring,
                self.__threshold_word
            )
            return False
        if aWord < aPhrase:
            _log.error(
                'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aWord,
                aPhrase,
                self.__threshold_word,
                self.__threshold_phrase
            )
            return False

        # now actually reassign thresholds
        self.__threshold_phrase = aPhrase
        self.__threshold_word = aWord
        self.__threshold_substring = aSubstring

        return True
    #--------------------------------------------------------
    def _set_word_separators(self, word_separators=None):
        # None disables separator normalization in getMatches()
        if word_separators is None:
            self.__word_separators = None
        else:
            self.__word_separators = regex.compile(word_separators)

    def _get_word_separators(self):
        if self.__word_separators is None:
            return None
        return self.__word_separators.pattern

    # regex source (string) of the word separators, or None
    word_separators = property(_get_word_separators, _set_word_separators)
    #--------------------------------------------------------
    def _set_ignored_chars(self, ignored_chars=None):
        # None disables removal of ignored characters in getMatches()
        if ignored_chars is None:
            self.__ignored_chars = None
        else:
            self.__ignored_chars = regex.compile(ignored_chars)

    def _get_ignored_chars(self):
        if self.__ignored_chars is None:
            return None
        return self.__ignored_chars.pattern

    # regex source (string) of the ignored characters, or None
    ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
    #--------------------------------------------------------
    def set_context (self, context=None, val=None):
        """Set value to provide context information for matches.

        The matching code may ignore it depending on its exact
        implementation. Names and values of the context depend
        on what is being matched.

        <context> -- the *placeholder* key *inside* the context
                     definition, not the context *definition* key

        Returns False (storing nothing) if context is None, else True.
        """
        if context is None:
            return False
        self._context_vals[context] = val
        return True
    #--------------------------------------------------------
    def unset_context(self, context=None):
        """Forget the value stored for <context>, if there is one."""
        try:
            del self._context_vals[context]
        except KeyError:
            pass
168 #------------------------------------------------------------ 169 # usable instances 170 #------------------------------------------------------------
class cMatchProvider_FixedList(cMatchProvider):
    """Match provider where all possible options can be held
    in a reasonably sized, pre-allocated list.
    """
    def __init__(self, aSeq = None):
        """aSeq must be a list or tuple of dicts. Each dict must have the keys (data, label, weight).

        Raises TypeError if aSeq is neither a list nor a tuple.
        """
        if not isinstance(aSeq, (list, tuple)):
            _log.error('fixed list match provider argument must be a list or tuple of dicts')
            raise TypeError('fixed list match provider argument must be a list or tuple of dicts')

        self.__items = aSeq
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #
    # all of them return (True, <matches, heaviest first>) or (False, [])
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        return self.__ranked([
            item for item in self.__items
            if item['label'].lower().startswith(aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        # A word starts either at the start of the label or right after
        # a space. Searching for " <fragment>" considers EVERY occurrence,
        # not just the first one (which may sit inside an earlier word).
        spaced_fragment = ' ' + aFragment
        matches = []
        for item in self.__items:
            label = item['label'].lower()
            if label.startswith(aFragment) or (spaced_fragment in label):
                matches.append(item)
        return self.__ranked(matches)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__ranked([
            item for item in self.__items
            if aFragment in item['label'].lower()
        ])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.__ranked(self.__items)
    #--------------------------------------------------------
    def set_items(self, items):
        """items must be a list of dicts. Each dict must have the keys (data, label, weight)"""
        self.__items = items
    #--------------------------------------------------------
    def __ranked(self, matches):
        """Wrap matches into the (found, items) result tuple.

        Items are returned as a new list ordered by descending weight.
        The input is never sorted in place, so the configured items
        keep their order and may just as well be a tuple. Items of
        equal weight keep their relative order.
        """
        if len(matches) == 0:
            return (False, [])
        return (True, sorted(matches, key = lambda item: item['weight'], reverse = True))
266 # ===========================================================
class cMatchProvider_Func(cMatchProvider):
    """Match provider which searches matches
    in the results of a function call.
    """
    def __init__(self, get_candidates = None):
        """get_candidates() must return a list of dicts, each of which has a 'label' key.

        Raises ValueError if no function is given.
        """
        if get_candidates is None:
            _log.error('must define function to retrieve match candidates list')
            raise ValueError('must define function to retrieve match candidates list')

        self._get_candidates = get_candidates
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #
    # all of them return (True, <matches>) or (False, []),
    # matches are in the order delivered by get_candidates()
    # FIXME: do ordering
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        _log.debug('getting phrase matches')
        # the LABEL must start with the fragment, not the other way round
        return self.__as_result([
            candidate for candidate in self._get_candidates()
            if candidate['label'].lower().startswith(aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        _log.debug('getting word matches')
        # FIXME: use word seps
        # A word starts at the start of the label or right after a space.
        # Labels which do not contain the fragment at all never match.
        spaced_fragment = ' ' + aFragment
        matches = []
        for candidate in self._get_candidates():
            label = candidate['label'].lower()
            if label.startswith(aFragment) or (spaced_fragment in label):
                matches.append(candidate)
        return self.__as_result(matches)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__as_result([
            candidate for candidate in self._get_candidates()
            if aFragment in candidate['label'].lower()
        ])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all candidates."""
        return self.__as_result(list(self._get_candidates()))
    #--------------------------------------------------------
    def __as_result(self, matches):
        """Wrap matches into the (found, items) tuple all getMatches*() methods return."""
        if len(matches) == 0:
            return (False, [])
        return (True, matches)

# ===========================================================
class cMatchProvider_SQL2(cMatchProvider):
    """Match provider which searches matches
    in possibly several database tables.

    queries:
        - a list of unicode strings
        - each string is a query
        - each string must contain: "... where <column> %(fragment_condition)s ..."
        - each string can contain in the where clause: "... %(<context_key>)s ..."

    context definitions to be used in the queries
    example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}
    """
    def __init__(self, queries = None, context = None):
        # a single query may be passed in without wrapping it in a list
        if not isinstance(queries, list):
            queries = [queries]

        self._queries = queries

        if context is None:
            self._context = {}
        else:
            self._context = context

        # query arguments, reused (and updated) across searches
        self._args = {}
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%s%%" % aFragment

        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""

        fragment_condition = u"~* %(fragment)s"
        aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
        # either preceded by a space or at the very start of the value
        self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)

        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%%%s%%" % aFragment

        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.getMatchesBySubstr(u'')
    #--------------------------------------------------------
    def __find_matches(self, fragment_condition):
        """Run the queries in order and return the rows of the first one which finds any.

        Returns (True, [{'data': ..., 'label': ..., 'weight': 0}, ...])
        built from the first two columns of each row, or (False, [])
        if no query returned rows.

        A query which fails to run is logged, removed from the list of
        queries and ends the search with (False, []).
        """
        for query in self._queries:
            where_fragments = {'fragment_condition': fragment_condition}

            for context_key, context_def in self._context.items():
                try:
                    placeholder = context_def['placeholder']
                    where_part = context_def['where_part']
                    self._args[placeholder] = self._context_vals[placeholder]
                    # we do have a context value for this key, so add the where condition
                    where_fragments[context_key] = where_part
                    if self.print_queries:
                        print(placeholder)
                        print(where_part)
                        print(self._context_vals[placeholder])
                except KeyError:
                    # we don't have a context value for this key, so skip the where condition
                    where_fragments[context_key] = u''

            cmd = query % where_fragments

            if self.print_queries:
                print(self.__class__.__name__)
                print(self._context_vals)
                print(self._args)
                print(cmd)

            try:
                rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}])
            except Exception:
                # do not catch KeyboardInterrupt/SystemExit here
                _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
                self._queries.remove(query)
                # the list was modified while iterating over it: stop here
                break

            # no matches found: try next query
            if len(rows) == 0:
                continue

            matches = [
                {'data': row[0], 'label': row[1], 'weight': 0}
                for row in rows
            ]
            return (True, matches)
        # none found whatsoever
        return (False, [])
465 #================================================================ 466 if __name__ == '__main__': 467 pass 468 469 #================================================================ 470