Subversion Repositories planix.SVN

Rev

Rev 2 | Blame | Compare with Previous | Last modification | View Log | RSS feed

# when raw index has a lot of entries like
# 1578324       problematico, a, ci, che
# apply this algorithm:
#  treat things after comma as suffixes
#  for each suffix:
#      if single letter, replace last letter
#      else search backwards for beginning of suffix
#      and if it leads to an old suffix of approximately
#      the same length, replace that suffix
# This will still leave some commas to fix by hand
# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex   (field separator is a single tab)

NF == 2 {
                i = index($2, ",")
                if(i == 0 || length($2) == 0)
                        print $0
                else {
                        n = split($2, a, /,[ ]*/)
                        w = a[1]
                        printf "%s\t%s\n", $1, w
                        for(i = 2; i <= n; i++) {
                                suf = a[i]
                                m = matchsuflen(w, suf)
                                if(m) {
                                        nw = substr(w, 1, length(w)-m) suf
                                        printf "%s\t%s\n", $1, nw
                                } else
                                        printf "%s\t%s\n", $1, w ", " suf
                        }
                }
        }
NF != 2 {
        print $0
        }

function matchsuflen(w, suf,            wlen,suflen,c,pat,k,d)
{
        wlen = length(w)
        suflen = length(suf)
        if(suflen == 1)
                return 1
        else {
                c = substr(suf, 1, 1)
                for (k = 1; k <= wlen ; k++)
                        if(substr(w, wlen-k+1, 1) == c)
                                break
                if(k > wlen)
                        return 0
                d = k-suflen
                if(d < 0)
                        d = -d
                if(d > 3)
                        return 0
                return k
        }
}