comfix.awk 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
# When the raw index has a lot of entries like
#     1578324 problematico, a, ci, che
# apply this algorithm:
#   treat the things after each comma as suffixes
#   for each suffix:
#     if it is a single letter, replace the last letter of the word
#     else search backwards for the beginning of the suffix,
#     and if that leads to an old suffix of approximately
#     the same length, replace that suffix
# This will still leave some commas to fix by hand.
# Usage (fields are tab-separated): awk -F'\t' -f comfix.awk rawindex > newrawindex
  12. NF == 2 {
  13. i = index($2, ",")
  14. if(i == 0 || length($2) == 0)
  15. print $0
  16. else {
  17. n = split($2, a, /,[ ]*/)
  18. w = a[1]
  19. printf "%s\t%s\n", $1, w
  20. for(i = 2; i <= n; i++) {
  21. suf = a[i]
  22. m = matchsuflen(w, suf)
  23. if(m) {
  24. nw = substr(w, 1, length(w)-m) suf
  25. printf "%s\t%s\n", $1, nw
  26. } else
  27. printf "%s\t%s\n", $1, w ", " suf
  28. }
  29. }
  30. }
  31. NF != 2 {
  32. print $0
  33. }
  34. function matchsuflen(w, suf, wlen,suflen,c,pat,k,d)
  35. {
  36. wlen = length(w)
  37. suflen = length(suf)
  38. if(suflen == 1)
  39. return 1
  40. else {
  41. c = substr(suf, 1, 1)
  42. for (k = 1; k <= wlen ; k++)
  43. if(substr(w, wlen-k+1, 1) == c)
  44. break
  45. if(k > wlen)
  46. return 0
  47. d = k-suflen
  48. if(d < 0)
  49. d = -d
  50. if(d > 3)
  51. return 0
  52. return k
  53. }
  54. }