1
2
3
4
5
6
7
8
9
10
11
12 _NINF = float('-1e300')
13
14 """
15 Classes for representing and processing probabilistic information.
16
17 The L{FreqDist} class is used to encode X{frequency distributions},
18 which count the number of times that each outcome of an experiment
19 occurs.
20
21 The L{ProbDistI} class defines a standard interface for X{probability
22 distributions}, which encode the probability of each outcome for an
23 experiment. There are two types of probability distribution:
24
25 - X{derived probability distributions} are created from frequency
26 distributions. They attempt to model the probability distribution
27 that generated the frequency distribution.
28 - X{analytic probability distributions} are created directly from
29 parameters (such as variance).
30
31 The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface
32 are used to encode conditional distributions. Conditional probability
33 distributions can be derived or analytic; but currently the only
34 implementation of the C{ConditionalProbDistI} interface is
35 L{ConditionalProbDist}, a derived distribution.
36
37 """
38
39 import math
40 import random
41 import warnings
42
43
44
45
46
48 """
49 A frequency distribution for the outcomes of an experiment. A
50 frequency distribution records the number of times each outcome of
51 an experiment has occurred. For example, a frequency distribution
52 could be used to record the frequency of each word type in a
53 document. Formally, a frequency distribution can be defined as a
54 function mapping from each sample to the number of times that
55 sample occurred as an outcome.
56
57 Frequency distributions are generally constructed by running a
58 number of experiments, and incrementing the count for a sample
59 every time it is an outcome of an experiment. For example, the
60 following code will produce a frequency distribution that encodes
61 how often each word occurs in a text:
62
63 >>> fdist = FreqDist()
64 >>> for word in tokenize.whitespace(sent):
65 ... fdist.inc(word.lower())
66
67 An equivalent way to do this is with the initializer:
68
69 >>> fdist = FreqDist(word.lower() for word in tokenize.whitespace(sent))
70
71 """
73 """
74 Construct a new frequency distribution. If C{samples} is
75 given, then the frequency distribution will be initialized
76 with the count of each object in C{samples}; otherwise, it
77 will be initialized to be empty.
78
79 In particular, C{FreqDist()} returns an empty frequency
80 distribution; and C{FreqDist(samples)} first creates an empty
81 frequency distribution, and then calls C{inc} for each element
82 in the list C{samples}.
83
84 @param samples: The samples to initialize the frequency
85 distribution with.
86 @type samples: Sequence
87 """
88 dict.__init__(self)
89 self._N = 0
90 self._Nr_cache = None
91 self._max_cache = None
92 if samples:
93 for sample in samples:
94 self.inc(sample)
95
96 - def inc(self, sample, count=1):
97 """
98 Increment this C{FreqDist}'s count for the given
99 sample.
100
101 @param sample: The sample whose count should be incremented.
102 @type sample: any
103 @param count: The amount to increment the sample's count by.
104 @type count: C{int}
105 @rtype: None
106 @raise NotImplementedError: If C{sample} is not a
107 supported sample type.
108 """
109 if count == 0: return
110
111 self._N += count
112 self[sample] = self.get(sample,0) + count
113
114
115 self._Nr_cache = None
116 self._max_cache = None
117
119 """
120 @return: The total number of sample outcomes that have been
121 recorded by this C{FreqDist}. For the number of unique
122 sample values (or bins) with counts greater than zero, use
123 C{FreqDist.B()}.
124 @rtype: C{int}
125 """
126 return self._N
127
129 """
130 @return: The total number of sample values (or X{bins}) that
131 have counts greater than zero. For the total
132 number of sample outcomes recorded, use C{FreqDist.N()}.
133 @rtype: C{int}
134 """
135 return len(self)
136
138 """
139 @return: A list of all samples that have been recorded as
140 outcomes by this frequency distribution. Use C{count()}
141 to determine the count for each sample.
142 @rtype: C{list}
143 """
144 return self.keys()
145
146 - def Nr(self, r, bins=None):
147 """
148 @return: The number of samples with count r.
149 @rtype: C{int}
150 @type r: C{int}
151 @param r: A sample count.
152 @type bins: C{int}
153 @param bins: The number of possible sample outcomes. C{bins}
154 is used to calculate Nr(0). In particular, Nr(0) is
155 C{bins-self.B()}. If C{bins} is not specified, it
156 defaults to C{self.B()} (so Nr(0) will be 0).
157 """
158 if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative'
159
160
161 if r == 0:
162 if bins is None: return 0
163 else: return bins-self.B()
164
165
166
167
168 if self._Nr_cache is None:
169 self._cache_Nr_values()
170
171 if r >= len(self._Nr_cache): return 0
172 return self._Nr_cache[r]
173
175 Nr = [0]
176 for sample in self:
177 c = self.get(sample, 0)
178 if c >= len(Nr):
179 Nr += [0]*(c+1-len(Nr))
180 Nr[c] += 1
181 self._Nr_cache = Nr
182
183 - def count(self, sample):
184 """
185 Return the count of a given sample. The count of a sample is
186 defined as the number of times that sample outcome was
187 recorded by this C{FreqDist}. Counts are non-negative
188 integers. This method has been replaced by conventional
189 dictionary indexing; use fd[item] instead of fd.count(item).
190
191 @return: The count of a given sample.
192 @rtype: C{int}
193 @param sample: the sample whose count
194 should be returned.
195 @type sample: any.
196 """
197 raise AttributeError, "Use indexing to look up an entry in a FreqDist, e.g. fd[item]"
198
199 - def freq(self, sample):
200 """
201 Return the frequency of a given sample. The frequency of a
202 sample is defined as the count of that sample divided by the
203 total number of sample outcomes that have been recorded by
204 this C{FreqDist}. The count of a sample is defined as the
205 number of times that sample outcome was recorded by this
206 C{FreqDist}. Frequencies are always real numbers in the range
207 [0, 1].
208
209 @return: The frequency of a given sample.
210 @rtype: float
211 @param sample: the sample whose frequency
212 should be returned.
213 @type sample: any
214 """
215 if self._N is 0:
216 return 0
217 return float(self[sample]) / self._N
218
220 """
221 Return the sample with the greatest number of outcomes in this
222 frequency distribution. If two or more samples have the same
223 number of outcomes, return one of them; which sample is
224 returned is undefined. If no outcomes have occurred in this
225 frequency distribution, return C{None}.
226
227 @return: The sample with the maximum number of outcomes in this
228 frequency distribution.
229 @rtype: any or C{None}
230 """
231 if self._max_cache is None:
232 best_sample = None
233 best_count = -1
234 for sample in self:
235 if self[sample] > best_count:
236 best_sample = sample
237 best_count = self[sample]
238 self._max_cache = best_sample
239 return self._max_cache
240
242 raise AttributeError, "Use FreqDist.sorted() to get the sorted samples"
243
244 - def plot(self, samples=None, *args, **kwargs):
245 """
246 Plot the given samples from the frequency distribution.
247 If no samples are specified, use all samples, in lexical sort order.
248 (Requires Matplotlib to be installed.)
249
250 @param samples: The samples to plot.
251 @type samples: C{list}
252 """
253 try:
254 import pylab
255 except ImportError:
256 raise ValueError('The plot function requires the matplotlib package.'
257 'See http://matplotlib.sourceforge.net/')
258 if not samples:
259 samples = sorted(self.samples())
260 values = [self[sample] for sample in samples]
261 if not args:
262 args = ["bo"]
263 pylab.grid(True, color="silver")
264 pylab.semilogy(values, *args, **kwargs)
265 pylab.xticks(range(len(samples)), samples, rotation=45, color="b")
266 pylab.show()
267
268 - def zipf_plot(self, num=40, *args, **kwargs):
269 """
270 Plot the most frequent samples of the frequency distribution.
271 (Requires Matplotlib to be installed.)
272
273 @param num: The number of samples to plot.
274 @type num: C{int}
275 """
276 samples = self.sorted()[:num]
277 self.plot(samples, *args, **kwargs)
278
279
281 """
282 Return the samples sorted in decreasing order of frequency. Instances
283 with the same count will be arbitrarily ordered. Instances with a
284 count of zero will be omitted. This method is C{O(N^2)}, where C{N} is
285 the number of samples, but will complete in a shorter time on average.
286
287 @return: The set of samples in sorted order.
288 @rtype: sequence of any
289 """
290 from operator import itemgetter
291 return [sample for (sample, count) in sorted(self.items(), key=itemgetter(1), reverse=True)]
292
294 """
295 @return: A string representation of this C{FreqDist}.
296 @rtype: string
297 """
298 return '<FreqDist with %d samples>' % self.N()
299
301 """
302 @return: A string representation of this C{FreqDist}.
303 @rtype: string
304 """
305 items = ['%r: %r' % (s, self[s]) for s in self.sorted()]
306 return '<FreqDist: %s>' % ', '.join(items)
307