source: trunk/doc/stats_weighted.tex @ 367

Last change on this file since 367 was 338, checked in by Peter, 17 years ago

added document about weighted statistics

File size: 10.0 KB
Line 
1\documentstyle[12pt]{article}
2%\documentstyle[slidesonly]{seminar}
3
4% $Id: error.tex,v 1.11 2004/10/01 09:08:39 peter Exp $
5
6%\input{psfig}
7
8% --- This bit puts ``draft'' over everything! ---
9%\special{!userdict begin /bop-hook{gsave 200 30 translate
10%65 rotate /Times-Roman findfont 216 scalefont setfont
11%0 0 moveto 0.93 setgray (DRAFT) show grestore}def end}
12%  --- end of this bit that puts `draft' over everything ---
13
14
15\flushbottom
16\footskip 54pt
17\headheight 0pt
18\headsep 0pt
19\oddsidemargin 0pt
20\parindent 0pt
21\parskip 2ex
22\textheight 230mm
23\textwidth 165mm
24\topmargin 0pt
25
26\newcommand{\bea} {\begin{eqnarray}}
27\newcommand{\eea} {\end{eqnarray}}
28\newcommand{\beq} {\begin{equation}}
29\newcommand{\eeq} {\end{equation}}
30\newcommand{\bibl}[5]
31        {#1, {\it #2} {\bf #3} (#4) #5}
32\newcommand{\ol}{\overline}
33
34\renewcommand{\baselinestretch} {1.0}
35\renewcommand{\textfraction} {0.1}
36\renewcommand{\topfraction} {1.0}
37\renewcommand{\bottomfraction} {1.0}
38\renewcommand{\floatpagefraction} {1.0}
39
40\renewcommand{\d}{{\mathrm{d}}}
41\newcommand{\nd}{$^{\mathrm{nd}}$}
42\newcommand{\eg}{{\it {e.g.}}}
43\newcommand{\ie}{{\it {i.e., }}}
44\newcommand{\etal}{{\it {et al.}}}
45\newcommand{\eref}[1]{Eq.~(\ref{e:#1})}
46\newcommand{\fref}[1]{Fig.~\ref{f:#1}}
47\newcommand{\ovr}[2]{\left(\begin{array}{c} #1 \\ #2 \end{array}\right)}
48
49% Use these to include comments and remarks into the text, these will
50% obviously appear as footnotes in the final output.
51\newcommand{\CR}[1]{\footnote{CR: #1}}
52\newcommand{\JH}[1]{\footnote{JH: #1}}
53\newcommand{\PE}[1]{\footnote{PE: #1}}
54\newcommand{\PJ}[1]{\footnote{PJ: #1}}
55
56\begin{document}
57
58\large
59{\bf Weighted Statistics}
60\normalsize
61
62\tableofcontents
63\clearpage
64
65\section{Introduction}
66
67
68\section{WeightedAverager}
69
70Building an estimator $T$ of the parameter $\Theta$, there
71are a number of criteria that we would like to be fulfilled.
72
73\begin{itemize}
74\item The bias $b=<T> - \Theta$ should be zero. The
75estimator is unbiased.
76
77\item The estimator is efficient if the mean squared error
78$<(T-\Theta)^2>$ is small. If the estimator is unbiased the
79mean squared error is equal to the variance
80$<(T-<T>)^2>$ of the estimator.
81
82\item The estimator is consistent if the mean squared error goes to
83zero in the limit of infinite number of data points.
84
85\item adding a data point with weight zero should not change the
86estimator.
87
88\end{itemize}
89
90We will use these criteria to find the estimator. We will minimize
91the variance with the constraint that the estimator must be
92unbiased. Also, we check that the estimator is consistent and handles
93zero weight in a desired way.
94
95
96\subsection{Mean}
97We start building an estimator of the mean in the case with varying
98variance.
99
100The likelihood is
101
102\beq
103L(m)=\prod (2\pi\sigma_i^2)^{-1/2}e^{-\frac{(x_i-m)^2}{2\sigma_i^2}},
104\eeq
105
106which we want to maximize, but can equally well minimize
107
108\beq
109-\ln L(m) = \sum \frac{1}{2}\ln2\pi\sigma_i^2+\frac{(x_i-m)^2}{2\sigma_i^2}.
110\eeq
111
112Taking the derivative yields
113
114\beq
115-\frac{d\ln L(m)}{dm}=\sum - \frac{x_i-m}{\sigma_i^2}
116\eeq
117
118Hence, the Maximum Likelihood method yields the estimator
119
120\beq
121m=\frac{\sum x_i/\sigma_i^2}{\sum 1/\sigma_i^2}
122\eeq
123
124Let us check the criteria defined above.
125First, we want the estimator to be unbiased.
126
127\beq
128b=<m>-\mu=\frac{\sum <x_i>/\sigma_i^2}{\sum 1/\sigma_i^2}-\mu=\frac{\sum \mu/\sigma_i^2}{\sum 1/\sigma_i^2}-\mu=0,
129\eeq
130
131so the estimator is unbiased.
132
133Second, we examine how efficient the estimator is, \ie how small the
134variance is.
135
136\beq
137V(m)=\frac{\sum V(x_i)/\sigma_i^2}{\left(\sum 1/\sigma_i^2\right)^2}=\frac{1}{\sum 1/\sigma_i^2},
138\eeq
139
140which obviously goes to zero when the number of samples (with finite
141$\sigma$) goes to infinity. The estimator is consistent.
142
143Trivially we can see that a zero weight data point does not change the
144estimator, which was our last condition.
145
146\subsection{Variance}
147Let us now examine the case when we do not know the variance, but only
148the weight $w_i$ that is proportional to the inverse
149of the variance $\sigma_i^2$
150
151\beq
152w_i=\frac{\kappa}{\sigma_i^2},
153\eeq
154
155and the variance is
156
157\beq
158V(m)=\frac{1}{\sum 1/\sigma_i^2}=\frac{\kappa}{\sum w_i}
159\eeq
160
161so we need to estimate $\kappa$. The likelihood is now
162
163\beq
164L(k)=P(k)\prod \frac{\sqrt{w_i}} {\sqrt{2\pi k}}e^{-\frac{w_i(x_i-m)^2}{2k}},
165\eeq
166
167where $P(k)$ is the prior probability distribution. If we have no prior knowledge
168about $k$, $P(k)$ is constant.
169
170Taking the derivative of the logarithm again yields
171
172\beq
173-\frac{d\ln L(k)}{dk}=-\frac{d\ln P(k)}{dk}+\sum \left(\frac{1}{2k}-\frac{w_i(x_i-m)^2}{2k^2}\right)
174\eeq
175
176or equivalently
177
178\beq
179k=\frac{1}{N}\sum w_i(x_i-m)^2+2k^2\frac{d\ln P(k)}{dk}
180\eeq
181
182In principle, any prior probability distribution $P(k)$ could be
183used. Here we, for simplicity, focus on the one where the last term
184becomes a constant, namely
185
186\beq
187P(k)=\exp(-\lambda/k).
188\eeq
189
190One problem with this choice is that we have to truncate the
191distribution in order to normalize it.
192
193The estimator $k$ becomes
194
195\beq
196k=\frac{1}{N}\sum w_i(x_i-m)^2+A,
197\eeq
198
199where $A$ is constant (depending on $\lambda$ and the truncation point).
200
201Having an estimation of $\kappa$ we can calculate the variance of $m$
202
203\beq
204V(m)=\frac{\frac{1}{N}\sum w_i(x_i-m)^2+A}{\sum w_i}=\frac{\frac{1}{N}\sum (x_i-m)^2/\sigma_i^2}{\sum 1/\sigma_i^2}+\frac{A}{\sum 1/\sigma_i^2}
205\eeq
206
207Let us now look at the estimation of $\kappa$. Are the criteria above
208fulfilled? We start by looking at the bias
209
210\beq
211b=<k>-\kappa=\frac{1}{N}\sum w_i\left<(x_i-m)^2\right>-\kappa
212\eeq
213
214Let us look at
215
216\bea
217\left<(x_i-m)^2\right>=
218\\\left<(x_i-\mu)^2+(m-\mu)^2-2(x_i-\mu)(m-\mu)\right>
219\\V(x_i)+V(m)-2\left<(x_i-\mu)(m-\mu)\right>
220\\V(x_i)+V(m)-2\left<(x_i-\mu)(\frac{\sum_j x_j/\sigma_j^2}{\sum_k 1/\sigma_k^2}-\mu)\right>
221\\V(x_i)+V(m)-2\left<(\frac{(x_i-\mu)^2/\sigma_i^2}{\sum_k 1/\sigma_k^2})\right>
222\\V(x_i)+V(m)-2\frac{1}{\sum_k 1/\sigma_k^2}
223\\\sigma_i^2-\frac{1}{\sum_k 1/\sigma_k^2}
224\eea
225
226so the bias is
227
228\bea
229b=<k>-\kappa=
230\\\frac{1}{N}\sum_i (w_i\sigma_i^2-\frac{w_i}{\sum_k 1/\sigma_k^2})-\kappa=
231\\\frac{\kappa}{N}\sum_i (1-\frac{w_i}{\sum_k w_k})-\kappa=
232\\\frac{\kappa}{N}(N-1)-\kappa
233\eea
234
235so the estimator is asymptotically unbiased. If we want the estimation
236to be unbiased we could \eg modify $N$ to $N-1$, exactly as in the
237unweighted case, and we get the following estimator of $\kappa$
238
239\beq
240k=\frac{1}{N-1}\sum w_i(x_i-m)^2
241\eeq
242
243One problem with this estimator is that it is sensitive to weight zero
244samples due to the $N$. To solve that we have to express $N$ using
245$w$, which will make the estimator biased. We suggest the substitution
246
247\beq
248N\rightarrow\frac{(\sum w_i)^2}{\sum w_i^2}
249\eeq
250
251so the estimator finally becomes
252
253\beq
254k=\frac{\sum w_i^2}{(\sum w_i)^2-\sum w_i^2}\sum w_i(x_i-m)^2
255\eeq
256
257and the variance is
258
259\beq
260V(m)=\frac{\sum w_i^2}{(\sum w_i)^2-\sum w_i^2}\frac{\sum w_i(x_i-m)^2}{\sum w_i}+\frac{A}{\sum 1/\sigma_i^2}
261\eeq
262
263
264\section{Score}
265\subsection{Pearson}
266
267Pearson correlation is defined as:
268\beq
269\frac{\sum_i(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_i (x_i-\bar{x})^2\sum_i (y_i-\bar{y})^2}}.
270\eeq
271The weighted version should satisfy the following conditions:
272
273\begin{itemize}
274\item Having N equal weights the expression reduces to the unweighted case.
275\item Adding a pair of data points where one of the weights is zero does not change the expression.
276\item When $x$ and $y$ are identical, the correlation is one.
277\end{itemize}
278
279Therefore we define the weighted correlation to be
280\beq
281\frac{\sum_iw_i^xw_i^y(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_iw_i^xw_i^y(x_i-\bar{x})^2\sum_iw_i^xw_i^y(y_i-\bar{y})^2}},
282\eeq
283where
284\beq
285\bar{x}=\frac{\sum_i w^x_iw^y_ix_i}{\sum_i w^x_iw^y_i}
286\eeq
287and
288\beq
289\bar{y}=\frac{\sum_i w^x_iw^y_iy_i}{\sum_i w^x_iw^y_i}.
290\eeq
291
292\subsection{ROC}
293If we have a set of values $x^+$ from class + and a set of values
294$x^-$ from class -, the ROC curve area is equal to
295
296\beq
297\frac{\sum_{\{i,j\}:x^-_i<x^+_j}1}{\sum_{i,j}1}
298\eeq
299
300so a natural extension using weights could be
301
302\beq
303\frac{\sum_{\{i,j\}:x^-_i<x^+_j}w^-_iw^+_j}{\sum_{i,j}w^-_iw^+_j}
304\eeq
305
306\section{Hierarchical clustering}
307\label{hc}
308A hierarchical clustering consists of two things: finding the two
309closest data points, merging these two data points into a new data point,
310and calculating the new distances from this point to all other points.
311
312In the first item, we need a distance matrix, and if we use Euclidean
313distances the natural modification of the expression would be
314
315\beq
316d(x,y)=\frac{\sum w_i^xw_i^y(x_i-y_i)^2}{\sum w_i^xw_i^y} \eeq
317
318For the second item, inspired by average linkage, we suggest
319
320\beq
321d(xy,z)=\frac{\sum w_i^xw_i^z(x_i-z_i)^2+\sum w_i^yw_i^z(y_i-z_i)^2}{\sum w_i^xw_i^z+\sum w_i^yw_i^z}
322\eeq
323
324to be the distance between the new merged point $xy$ and $z$, and we
325also calculate new weights for this point: $w^{xy}_i=w^x_i+w^y_i$
326
327\section{Regression}
328We have the model
329
330\beq 
331y_i=\alpha+\beta (x_i-m_x)+\epsilon_i,
332\eeq 
333
334where $\epsilon_i$ is the noise. The variance of the noise is
335inversely proportional to the weight,
336$Var(\epsilon_i)=\frac{\sigma^2}{w_i}$. In order to determine the
337model parameters, we minimize the weighted sum of squared errors,
338
339\beq
340Q_0 = \sum w_i\epsilon_i^2.
341\eeq
342
343Taking the derivative with respect to $\alpha$ and $\beta$ yields two conditions
344
345\beq
346\frac{\partial Q_0}{\partial \alpha} = -2 \sum w_i(y_i - \alpha - \beta (x_i-m_x))=0
347\eeq
348
349and
350
351\beq
352\frac{\partial Q_0}{\partial \beta} = -2 \sum w_i(x_i-m_x)(y_i-\alpha-\beta(x_i-m_x))=0
353\eeq
354
355or equivalently
356
357\beq
358\alpha = \frac{\sum w_iy_i}{\sum w_i}=m_y
359\eeq
360
361and
362
363\beq
364\beta=\frac{\sum w_i(x_i-m_x)(y_i-m_y)}{\sum w_i(x_i-m_x)^2}
365\eeq
366
367Note, by having all weights equal we get back the unweighted
368case. Furthermore, we calculate the variance of the estimators of
369$\alpha$ and $\beta$.
370
371\beq 
372\textrm{Var}(\alpha )=\frac{\sum w_i^2\frac{\sigma^2}{w_i}}{(\sum w_i)^2}=
373\frac{\sigma^2}{\sum w_i}
374\eeq
375
376and
377\beq
378\textrm{Var}(\beta )= \frac{\sum w_i^2(x_i-m_x)^2\frac{\sigma^2}{w_i}}
379{(\sum w_i(x_i-m_x)^2)^2}=
380\frac{\sigma^2}{\sum w_i(x_i-m_x)^2}
381\eeq
382
383Finally, we estimate the level of noise, $\sigma^2$. Inspired by the
384unweighted estimation
385
386\beq
387s^2=\frac{\sum (y_i-\alpha-\beta (x_i-m_x))^2}{n-2}
388\eeq
389
390we suggest the following estimator
391
392\beq
393s^2=\frac{\sum w_i(y_i-\alpha-\beta (x_i-m_x))^2}{\sum w_i-2\frac{\sum w_i^2}{\sum w_i}}
394\eeq
395
396\end{document}
397
398
399
Note: See TracBrowser for help on using the repository browser.