Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
# When the raw index has a lot of entries like
# 1578324	problematico, a, ci, che
# apply this algorithm:
#  treat the things after each comma as suffixes
#  for each suffix:
#      if it is a single letter, replace the last letter of the word
#      else search backwards for the beginning of the suffix,
#      and if that leads to an old suffix of approximately
#      the same length, replace that suffix
# This will still leave some commas to fix by hand.
# Usage: awk -F'	' -f comfix.awk rawindex > newrawindex
12
 
13
# Records with exactly two fields (key, entry): expand comma-abbreviated
# entries.  The first comma-separated item is the base word; each later
# item is a suffix.  One output record is written per resulting word,
# all carrying the original key in field 1.
NF == 2	{
		if(index($2, ",") == 0)
			# no comma (this also covers an empty entry): pass through
			print $0
		else {
			n = split($2, a, /,[ ]*/)
			w = a[1]
			printf "%s\t%s\n", $1, w
			for(i = 2; i <= n; i++) {
				suf = a[i]
				# m = number of trailing characters of w to replace, 0 if unknown
				m = matchsuflen(w, suf)
				if(m) {
					nw = substr(w, 1, length(w)-m) suf
					printf "%s\t%s\n", $1, nw
				} else
					# could not place the suffix: keep the comma form
					# so it can be fixed by hand
					printf "%s\t%s\n", $1, w ", " suf
			}
		}
	}
# Any record that does not have exactly two fields is copied unchanged.
NF != 2 {
	print $0
	}
35
 
36
# matchsuflen(w, suf): how many trailing characters of w the suffix suf
# should replace, or 0 if no plausible old suffix was found.
#   - a single-character suf always replaces exactly the last character (1)
#   - otherwise scan w from its end for the first character of suf; the
#     tail starting there (k characters) is the candidate old suffix
#   - the candidate is rejected (0) if that character does not occur in w,
#     or if its length differs from length(suf) by more than 3
# Names after the wide gap in the parameter list are locals.
function matchsuflen(w, suf,		wlen,suflen,c,k,d)
{
	wlen = length(w)
	suflen = length(suf)
	if(suflen == 1)
		return 1
	c = substr(suf, 1, 1)
	# k counts characters from the end of w; stop at the nearest match
	for(k = 1; k <= wlen; k++)
		if(substr(w, wlen-k+1, 1) == c)
			break
	if(k > wlen)
		return 0
	# d = |k - suflen|: old and new suffix must be of similar length
	d = k - suflen
	if(d < 0)
		d = -d
	if(d > 3)
		return 0
	return k
}