Package nltk :: Module probability
[hide private]
[frames] | no frames]

Source Code for Module nltk.probability

   1  # Natural Language Toolkit: Probability and Statistics 
   2  # 
   3  # Copyright (C) 2001-2008 NLTK Project 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  #         Steven Bird <sb@csse.unimelb.edu.au> (additions) 
   6  #         Trevor Cohn <tacohn@cs.mu.oz.au> (additions) 
   7  # URL: <http://nltk.org> 
   8  # For license information, see LICENSE.TXT 
   9  # 
  10  # $Id: probability.py 6445 2008-08-16 07:46:34Z stevenbird $ 
  11   
  12  _NINF = float('-1e300') 
  13   
  14  """ 
  15  Classes for representing and processing probabilistic information. 
  16   
  17  The L{FreqDist} class is used to encode X{frequency distributions}, 
  18  which count the number of times that each outcome of an experiment 
  19  occurs. 
  20   
  21  The L{ProbDistI} class defines a standard interface for X{probability 
  22  distributions}, which encode the probability of each outcome for an 
  23  experiment.  There are two types of probability distribution: 
  24   
  25    - X{derived probability distributions} are created from frequency 
  26      distributions.  They attempt to model the probability distribution 
  27      that generated the frequency distribution. 
  28    - X{analytic probability distributions} are created directly from 
  29      parameters (such as variance). 
  30   
  31  The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface 
  32  are used to encode conditional distributions.  Conditional probability 
  33  distributions can be derived or analytic; but currently the only 
  34  implementation of the C{ConditionalProbDistI} interface is 
  35  L{ConditionalProbDist}, a derived distribution. 
  36   
  37  """ 
  38   
  39  import math 
  40  import random 
  41  import warnings 
  42   
  43  ##////////////////////////////////////////////////////// 
  44  ##  Frequency Distributions 
  45  ##////////////////////////////////////////////////////// 
  46   
47 -class FreqDist(dict):
48 """ 49 A frequency distribution for the outcomes of an experiment. A 50 frequency distribution records the number of times each outcome of 51 an experiment has occurred. For example, a frequency distribution 52 could be used to record the frequency of each word type in a 53 document. Formally, a frequency distribution can be defined as a 54 function mapping from each sample to the number of times that 55 sample occurred as an outcome. 56 57 Frequency distributions are generally constructed by running a 58 number of experiments, and incrementing the count for a sample 59 every time it is an outcome of an experiment. For example, the 60 following code will produce a frequency distribution that encodes 61 how often each word occurs in a text: 62 63 >>> fdist = FreqDist() 64 >>> for word in tokenize.whitespace(sent): 65 ... fdist.inc(word.lower()) 66 67 An equivalent way to do this is with the initializer: 68 69 >>> fdist = FreqDist(word.lower() for word in tokenize.whitespace(sent)) 70 71 """
72 - def __init__(self, samples=None):
73 """ 74 Construct a new frequency distribution. If C{samples} is 75 given, then the frequency distribution will be initialized 76 with the count of each object in C{samples}; otherwise, it 77 will be initialized to be empty. 78 79 In particular, C{FreqDist()} returns an empty frequency 80 distribution; and C{FreqDist(samples)} first creates an empty 81 frequency distribution, and then calls C{inc} for each element 82 in the list C{samples}. 83 84 @param samples: The samples to initialize the frequency 85 distribution with. 86 @type samples: Sequence 87 """ 88 dict.__init__(self) 89 self._N = 0 90 self._Nr_cache = None 91 self._max_cache = None 92 if samples: 93 for sample in samples: 94 self.inc(sample)
95
96 - def inc(self, sample, count=1):
97 """ 98 Increment this C{FreqDist}'s count for the given 99 sample. 100 101 @param sample: The sample whose count should be incremented. 102 @type sample: any 103 @param count: The amount to increment the sample's count by. 104 @type count: C{int} 105 @rtype: None 106 @raise NotImplementedError: If C{sample} is not a 107 supported sample type. 108 """ 109 if count == 0: return 110 111 self._N += count 112 self[sample] = self.get(sample,0) + count 113 114 # Invalidate the Nr cache and max cache. 115 self._Nr_cache = None 116 self._max_cache = None
117
118 - def N(self):
119 """ 120 @return: The total number of sample outcomes that have been 121 recorded by this C{FreqDist}. For the number of unique 122 sample values (or bins) with counts greater than zero, use 123 C{FreqDist.B()}. 124 @rtype: C{int} 125 """ 126 return self._N
127
128 - def B(self):
129 """ 130 @return: The total number of sample values (or X{bins}) that 131 have counts greater than zero. For the total 132 number of sample outcomes recorded, use C{FreqDist.N()}. 133 @rtype: C{int} 134 """ 135 return len(self)
136
137 - def samples(self):
138 """ 139 @return: A list of all samples that have been recorded as 140 outcomes by this frequency distribution. Use C{count()} 141 to determine the count for each sample. 142 @rtype: C{list} 143 """ 144 return self.keys()
145
146 - def Nr(self, r, bins=None):
147 """ 148 @return: The number of samples with count r. 149 @rtype: C{int} 150 @type r: C{int} 151 @param r: A sample count. 152 @type bins: C{int} 153 @param bins: The number of possible sample outcomes. C{bins} 154 is used to calculate Nr(0). In particular, Nr(0) is 155 C{bins-self.B()}. If C{bins} is not specified, it 156 defaults to C{self.B()} (so Nr(0) will be 0). 157 """ 158 if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative' 159 160 # Special case for Nr(0): 161 if r == 0: 162 if bins is None: return 0 163 else: return bins-self.B() 164 165 # We have to search the entire distribution to find Nr. Since 166 # this is an expensive operation, and is likely to be used 167 # repeatedly, cache the results. 168 if self._Nr_cache is None: 169 self._cache_Nr_values() 170 171 if r >= len(self._Nr_cache): return 0 172 return self._Nr_cache[r]
173
174 - def _cache_Nr_values(self):
175 Nr = [0] 176 for sample in self: 177 c = self.get(sample, 0) 178 if c >= len(Nr): 179 Nr += [0]*(c+1-len(Nr)) 180 Nr[c] += 1 181 self._Nr_cache = Nr
182
183 - def count(self, sample):
184 """ 185 Return the count of a given sample. The count of a sample is 186 defined as the number of times that sample outcome was 187 recorded by this C{FreqDist}. Counts are non-negative 188 integers. This method has been replaced by conventional 189 dictionary indexing; use fd[item] instead of fd.count(item). 190 191 @return: The count of a given sample. 192 @rtype: C{int} 193 @param sample: the sample whose count 194 should be returned. 195 @type sample: any. 196 """ 197 raise AttributeError, "Use indexing to look up an entry in a FreqDist, e.g. fd[item]"
198
199 - def freq(self, sample):
200 """ 201 Return the frequency of a given sample. The frequency of a 202 sample is defined as the count of that sample divided by the 203 total number of sample outcomes that have been recorded by 204 this C{FreqDist}. The count of a sample is defined as the 205 number of times that sample outcome was recorded by this 206 C{FreqDist}. Frequencies are always real numbers in the range 207 [0, 1]. 208 209 @return: The frequency of a given sample. 210 @rtype: float 211 @param sample: the sample whose frequency 212 should be returned. 213 @type sample: any 214 """ 215 if self._N is 0: 216 return 0 217 return float(self[sample]) / self._N
218
219 - def max(self):
220 """ 221 Return the sample with the greatest number of outcomes in this 222 frequency distribution. If two or more samples have the same 223 number of outcomes, return one of them; which sample is 224 returned is undefined. If no outcomes have occurred in this 225 frequency distribution, return C{None}. 226 227 @return: The sample with the maximum number of outcomes in this 228 frequency distribution. 229 @rtype: any or C{None} 230 """ 231 if self._max_cache is None: 232 best_sample = None 233 best_count = -1 234 for sample in self: 235 if self[sample] > best_count: 236 best_sample = sample 237 best_count = self[sample] 238 self._max_cache = best_sample 239 return self._max_cache
240
241 - def sorted_samples(self):
242 raise AttributeError, "Use FreqDist.sorted() to get the sorted samples"
243
244 - def plot(self, samples=None, *args, **kwargs):
245 """ 246 Plot the given samples from the frequency distribution. 247 If no samples are specified, use all samples, in lexical sort order. 248 (Requires Matplotlib to be installed.) 249 250 @param samples: The samples to plot. 251 @type samples: C{list} 252 """ 253 try: 254 import pylab 255 except ImportError: 256 raise ValueError('The plot function requires the matplotlib package.' 257 'See http://matplotlib.sourceforge.net/') 258 if not samples: 259 samples = sorted(self.samples()) 260 values = [self[sample] for sample in samples] 261 if not args: 262 args = ["bo"] 263 pylab.grid(True, color="silver") 264 pylab.semilogy(values, *args, **kwargs) 265 pylab.xticks(range(len(samples)), samples, rotation=45, color="b") 266 pylab.show()
267
268 - def zipf_plot(self, num=40, *args, **kwargs):
269 """ 270 Plot the most frequent samples of the frequency distribution. 271 (Requires Matplotlib to be installed.) 272 273 @param num: The number of samples to plot. 274 @type num: C{int} 275 """ 276 samples = self.sorted()[:num] 277 self.plot(samples, *args, **kwargs)
278 279 # SB: cache the sorted samples?
280 - def sorted(self):
281 """ 282 Return the samples sorted in decreasing order of frequency. Instances 283 with the same count will be arbitrarily ordered. Instances with a 284 count of zero will be omitted. This method is C{O(N^2)}, where C{N} is 285 the number of samples, but will complete in a shorter time on average. 286 287 @return: The set of samples in sorted order. 288 @rtype: sequence of any 289 """ 290 from operator import itemgetter 291 return [sample for (sample, count) in sorted(self.items(), key=itemgetter(1), reverse=True)]
292
293 - def __repr__(self):
294 """ 295 @return: A string representation of this C{FreqDist}. 296 @rtype: string 297 """ 298 return '<FreqDist with %d samples>' % self.N()
299
300 - def __str__(self):
301 """ 302 @return: A string representation of this C{FreqDist}. 303 @rtype: string 304 """ 305 items = ['%r: %r' % (s, self[s]) for s in self.sorted()] 306 return '<FreqDist: %s>' % ', '.join(items)
307
308 - def __getitem__(self, sample):
309 return self.