1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
# When the raw index has a lot of entries like
#     1578324  problematico, a, ci, che
# (key and entry separated by a tab), apply this algorithm:
#   treat the items after each comma as suffixes
#   for each suffix:
#     if it is a single letter, replace the last letter of the word
#     else search backwards in the word for the beginning of the suffix,
#       and if that leads to an old suffix of approximately
#       the same length, replace that old suffix
# This will still leave some commas to fix by hand.
# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
# Pass-through rule: a record that does not have exactly two fields, or
# whose second field contains no comma, is copied to the output as-is.
NF != 2 || index($2, ",") == 0 {
    print
    next
}

# Expansion rule: the second field looks like "word, suf1, suf2, ...".
# Emit one output line per form, each keyed by the first field.
{
    count = split($2, part, /,[ ]*/)
    base = part[1]

    # The base word itself always comes first.
    printf "%s\t%s\n", $1, base

    for (j = 2; j <= count; j++) {
        ending = part[j]
        drop = matchsuflen(base, ending)
        if (!drop) {
            # No plausible replacement point: keep the comma form so it
            # can be fixed by hand later.
            printf "%s\t%s\n", $1, base ", " ending
            continue
        }
        # Cut `drop` characters off the end of the base word and attach
        # the new ending in their place.
        printf "%s\t%s\n", $1, substr(base, 1, length(base) - drop) ending
    }
}
# matchsuflen(w, suf): decide how many trailing characters of the word w
# should be replaced by the suffix suf.
#
# Parameters:
#   w    base word
#   suf  replacement suffix
# The remaining names in the parameter list (wlen, suflen, c, k, d) are
# locals declared in the usual awk manner; callers pass only w and suf.
#
# Returns the number of characters to remove from the end of w before
# appending suf, or 0 when no plausible replacement point exists:
#   - empty w or empty suf      -> 0
#   - one-character suf         -> 1 (swap the last letter)
#   - otherwise                 -> the distance k from the end of w to the
#     nearest occurrence of suf's first character, accepted only when k
#     differs from length(suf) by at most 3.
# The result never exceeds length(w).
function matchsuflen(w, suf,    wlen, suflen, c, k, d)
{
    wlen = length(w)
    suflen = length(suf)

    # An empty word has no last letter to replace, and an empty suffix
    # has no first character to search for.
    if (wlen == 0 || suflen == 0)
        return 0

    # Single-letter suffix: it simply replaces the last letter.
    if (suflen == 1)
        return 1

    # Scan from the end of the word towards its start for the first
    # character of the suffix; k counts characters from the end.
    c = substr(suf, 1, 1)
    for (k = 1; k <= wlen; k++)
        if (substr(w, wlen - k + 1, 1) == c)
            break
    if (k > wlen)
        return 0

    # Accept only when the old ending and the new suffix have roughly
    # the same length (absolute difference of at most 3).
    d = k - suflen
    if (d < 0)
        d = -d
    if (d > 3)
        return 0
    return k
}
|