/kf6-qt6/sdk/pology/lang/nn/exclusion/README

0001 # Exclusion dictionaries for Norwegian Nynorsk
0002 # ────────────────────────────────────────────
0003 #
0004 # [Note: This file is runnable using ‘sh’.]
0005 #
0006 # To generate the dictionaries, you need four files:
0007 #   fullform_nn-r103.txt, fullform_nn-r183.txt
0008 #   fullform_nn.dat, fullform_nb.dat
0009 #
0010 # The files fullform_nn-r103.txt and fullform_nn-r183.txt
0011 # should be checkouts of revisions 103 and 183,
0012 # respectively, of the file fullform_nn.txt from ordbanken
0013 # https://savannah.nongnu.org/projects/ordbanken/
0014 # (Only revision 103 and earlier revisions contain
0015 # information on ‘klammeformer’, while only revision 183
0016 # and earlier revisions contain information on
0017 # ‘unormert’ words.)
0018 #
0019 # The .dat files are word list files from the *latest*
0020 # revision of ordbanken (compiled using the ‘make’ command).
0021
0022
# The file ‘klammeformer.dat’ contains a list of all
# words that were ‘klammeformer’ in the pre-2012
# Norwegian Nynorsk orthography. A first version,
# ‘klammeformer.txt’, is generated from
# ‘fullform_nn-r103.txt’ using these commands:

make_klammeformer_txt() {
  # The input is tab-separated. The tab is produced with printf instead
  # of being typed inside quotes, so that the delimiter survives editors
  # and viewers that expand tabs to spaces.
  tab=$(printf '\t')

  # Skip lines starting with ‘*’, then set aside the lines that are
  # not tagged ‘klammeform’.
  grep -v '^\*' fullform_nn-r103.txt > alle.txt
  grep -Fv " klammeform" alle.txt > hovudformer.txt

  # Field 3 is used as the word. A word that is in the full list but
  # not in the untagged list occurs only on ‘klammeform’ lines.
  cut -f3 -d"$tab" alle.txt | sort -u > alle.dat
  cut -f3 -d"$tab" hovudformer.txt | sort -u > hovudformer.dat
  comm -23 alle.dat hovudformer.dat > klammeformer.txt

  rm -f alle.txt alle.dat hovudformer.txt hovudformer.dat
}
make_klammeformer_txt

# ‘klammeformer.txt’ is further modified into
# ‘klammeformer.dat’ later in the script, to remove
# some spellings of nouns which are now allowed.
0040
0041
0042
# The file ‘unormert.dat’ contains a list of all
# words that were included in the latest
# version of Norsk ordbank that contained information
# on ‘unormert’ words but are not present in the
# current version of Norsk ordbank. That is, it
# contains a list of ‘unormert’ words and other
# words which have been removed from the dictionary.
# Words already listed in ‘klammeformer.txt’ are left out.

make_unormert_dat() {
  # Tab field separator, built with printf so it cannot be lost when
  # the file is viewed or edited with tab expansion.
  tab=$(printf '\t')

  # Current words: field 2 of the .dat file.
  cut -f2 -d"$tab" fullform_nn.dat | sort -u > alle.dat
  # Old words: field 3 of the r183 text file, skipping lines that
  # start with ‘*’.
  grep -v '^\*' fullform_nn-r183.txt | cut -f3 -d"$tab" | sort -u > gamle.txt

  # Keep the old words that are no longer current ...
  comm -13 alle.dat gamle.txt > unormert.txt
  # ... and that are not in the ‘klammeformer’ list.
  comm -23 unormert.txt klammeformer.txt > unormert.dat

  rm -f alle.dat gamle.txt unormert.txt
}
make_unormert_dat
0056
0057
0058
# The file ‘bokmal-words.dat’ contains a list of all words that
# are valid in Norwegian Bokmål, but not in Norwegian Nynorsk.
# Words containing only non-letter characters or containing
# uppercase letters are excluded, to avoid too many false
# positives. Still, there will be *many* false positives, so
# this file should not be used in any ‘default’ rule sets.
# It is generated from ‘fullform_nn.dat’ and ‘fullform_nb.dat’
# using these commands:

# Print the sorted, unique words found in field 2 of the
# tab-separated file $1. Entries containing spaces are split into
# one word per line; words consisting only of non-letters and words
# containing an uppercase letter are dropped.
_bokmal_word_list() {
  # Tab built with printf so it survives tab-expanding editors/viewers.
  tab=$(printf '\t')
  cut -f2 -d"$tab" "$1" |
    tr ' ' '\n' |
    grep -Ev '^[^A-Za-zÆØÅæøå]+$' |
    grep -v '[A-ZÆØÅ]' |
    sort -u
}

make_bokmal_words_dat() {
  _bokmal_word_list fullform_nn.dat > ord-nn.txt
  _bokmal_word_list fullform_nb.dat > ord-nb.txt

  # Words that occur in the Bokmål list only.
  comm -23 ord-nb.txt ord-nn.txt > bokmal-words.dat

  rm -f ord-nn.txt ord-nb.txt
}
make_bokmal_words_dat
0074
0075
0076
# The file ‘imperativfeil.dat’ contains a list of imperatives
# misspelled with an accent. For example, it contains the
# word ‘installér’ (should be spelled ‘installer’).
# It is generated from ‘fullform_nn.dat’ using these commands:

make_imperativfeil_dat() {
  # Tab built with printf so the pattern below cannot silently turn
  # into spaces when the file is viewed or edited with tab expansion.
  tab=$(printf '\t')

  # Take field 2 of the lines tagged as verb imperatives, keep the
  # forms that end in ‘er’ and do not start with ‘-’, and produce
  # the accented misspelling by replacing the final ‘er’ with ‘ér’.
  grep -F "verb${tab}imp" fullform_nn.dat |
    awk -F'\t' '{ print $2 }' |
    grep '^[^-].*er$' |
    sort -u |
    sed 's/er$/ér/' > imperativfeil.dat
}
make_imperativfeil_dat
0082
0083
# The file ‘e-infinitiv.dat’ contains a list of all infinitives
# ending in -e where there are no other word forms with the
# exact same spelling. For example, it contains the word ‘lagre’
# (should be spelled ‘lagra’ according to our translation guidelines),
# but not the word ‘opne’, as ‘opne’ is also used as an adjective,
# for example in ‘fleire opne program’. The file is generated from
# ‘fullform_nn.dat’ using these commands:

make_e_infinitiv_dat() {
  # Tab built with printf so it survives tab-expanding editors/viewers.
  tab=$(printf '\t')

  # For verb lines tagged ‘inf’ or ‘imp’ whose form (field 2) ends in
  # ‘a’, print field 1 when it equals that form with the final ‘a’
  # replaced by ‘e’.
  awk -F'\t' '
    $3 == "verb" && ($4 == "inf" || $4 == "imp") {
      n = length($2)
      if (substr($2, n) == "a" && $1 == substr($2, 1, n - 1) "e")
        print $1
    }' fullform_nn.dat | sort -u > ea-inf.txt

  # All forms ending in ‘e’ on lines that are not verb inf/imp lines.
  grep -Ev "${tab}verb${tab}i(nf|mp)" fullform_nn.dat |
    cut -f2 -d"$tab" |
    grep 'e$' |
    sort -u > e-ord.txt

  # Keep only the -e infinitives that have no such homograph.
  comm -23 ea-inf.txt e-ord.txt > e-infinitiv.dat

  rm -f ea-inf.txt e-ord.txt
}
make_e_infinitiv_dat
0104
0105
# The file ‘subst-mask-er.dat’ contains a list of -er/-ene inflections of
# masculine nouns that can have both a -ar/-ane or a -er/-ene suffix.
# For example, the noun «gjest» can be written as gjestar/gjestane
# or gjester/gjestene, so the output file contains ‘gjester’ and ‘gjestene’.
# Note that the (masculine) noun ‘elv’ is removed, as it’s a rare word
# and conflicts with the feminine ‘elv’.
#
# The file ‘subst-fem-ar.dat’ contains a list of -ar/-ane inflections of
# feminine nouns that can have both a -ar/-ane or a -er/-ene suffix.
# For example, the noun «sideelv» can be written as sideelver/sideelvene
# or sideelvar/sideelvane, so the output file contains ‘sideelvar’ and
# ‘sideelvane’.
#
# The files are generated from ‘fullform_nn.dat’ using these commands:

# Usage: _subst_variant_forms KEEP ALT FILE
# KEEP and ALT are the suffix vowels (‘e’ and ‘a’, or the reverse).
# Reads the sorted, tab-separated FILE and prints field 3 of every
# line that has the same field 1 as the line before it, ends in
# KEEP+‘ne’ or KEEP+‘r’, and whose preceding line holds the same word
# with ALT in place of KEEP in that suffix.
_subst_variant_forms() {
  awk -F'\t' -v keep="$1" -v alt="$2" '
    {
      n = length($3)
      if ($1 == previd &&
          ((substr($3, n - 2) == keep "ne" && prevw == substr($3, 1, n - 3) alt "ne") ||
           (substr($3, n - 1) == keep "r"  && prevw == substr($3, 1, n - 2) alt "r")))
        print $3
      # Remember this line for the next comparison, printed or not.
      previd = $1
      prevw = $3
    }' "$3"
}

make_subst_variants() {
  # Tab built with printf so it survives tab-expanding editors/viewers.
  tab=$(printf '\t')

  grep -F "${tab}subst" fullform_nn.dat > subst-ok.dat

  # Prepend the next-to-last field to every line and sort, so that the
  # lines compared by _subst_variant_forms end up next to each other.
  awk -F'\t' '{ print $(NF-1) "\t" $0 }' subst-ok.dat |
    sort -t"$tab" -k2,2 -k1,1 -k5 -k3,3 > subst-ok-sort.dat

  # Masculine lines, minus those having ‘elv’ as a complete field.
  grep -F "${tab}mask${tab}" subst-ok-sort.dat |
    grep -v "${tab}elv${tab}.*mask" > subst-mask.dat
  grep -F "${tab}fem${tab}" subst-ok-sort.dat > subst-fem.dat

  _subst_variant_forms e a subst-mask.dat > subst-mask-er.dat
  _subst_variant_forms a e subst-fem.dat > subst-fem-ar.dat

  rm -f subst-ok.dat subst-ok-sort.dat subst-mask.dat subst-fem.dat
}
make_subst_variants
0145
0146
# The file ‘subst-mask-artikkel.dat’ contains a list of masculine
# nouns but prefixed with the indefinite articles ‘ei’ and ‘eit’.
# Only words that are not homographs with other words are included.
# The files ‘subst-fem-artikkel.dat’ and ‘subst-noyt-artikkel.dat’
# are similar, but for feminine and neuter words, respectively.
#
# The files are generated from ‘fullform_nn.dat’ using these commands:

# Usage: _wrong_articles GENDER ARTICLE1 ARTICLE2
# Writes ‘subst-GENDER-artikkel.dat’: every word that is listed in
# ‘subst-GENDER.tmp’ but not in ‘ikkje-subst-GENDER.tmp’, once with each
# of the two articles. ‘comm -23’ selects exactly those words, so
# no word is dropped just because it looks like the number zero.
_wrong_articles() {
  comm -23 "subst-$1.tmp" "ikkje-subst-$1.tmp" |
    awk -v a="$2" -v b="$3" '$0 != "" { print a " " $0; print b " " $0 }' \
      > "subst-$1-artikkel.dat"
}

make_subst_artikkel() {
  # Tab built with printf so it survives tab-expanding editors/viewers.
  tab=$(printf '\t')

  # Noun lines, and the forms (field 2) of all other lines.
  grep -F "${tab}subst${tab}" fullform_nn.dat > subst.tmp
  grep -F -v "${tab}subst${tab}" fullform_nn.dat |
    awk -F'\t' '{ print $2 }' | sort -u > ikkje-subst.tmp

  # Noun lines tagged ‘eint’ + ‘ub’, split by gender.
  grep -F "${tab}eint${tab}ub${tab}" subst.tmp > subst-grunnord.tmp
  grep -F "${tab}subst${tab}mask" subst-grunnord.tmp |
    awk -F'\t' '{ print $2 }' | sort -u > subst-mask.tmp
  grep -F "${tab}subst${tab}fem" subst-grunnord.tmp |
    awk -F'\t' '{ print $2 }' | sort -u > subst-fem.tmp
  grep -F "${tab}subst${tab}nøyt" subst-grunnord.tmp |
    awk -F'\t' '{ print $2 }' | sort -u > subst-noyt.tmp

  # For each gender: everything that is a non-noun form or a base
  # form of one of the two other genders (the possible homographs).
  sort -u ikkje-subst.tmp subst-fem.tmp subst-noyt.tmp > ikkje-subst-mask.tmp
  sort -u ikkje-subst.tmp subst-mask.tmp subst-noyt.tmp > ikkje-subst-fem.tmp
  sort -u ikkje-subst.tmp subst-mask.tmp subst-fem.tmp > ikkje-subst-noyt.tmp

  _wrong_articles mask ei eit
  _wrong_articles fem ein eit
  _wrong_articles noyt ein ei

  rm -f subst.tmp ikkje-subst.tmp subst-grunnord.tmp \
        subst-mask.tmp subst-fem.tmp subst-noyt.tmp \
        ikkje-subst-mask.tmp ikkje-subst-fem.tmp ikkje-subst-noyt.tmp
}
make_subst_artikkel
0169
0170
0171
# Remove some common (correct) spellings from the ‘klammeformer’
# word list (-ar/-ane suffix on masculine nouns and -er/-ene
# suffix on feminine nouns).

make_klammeformer_dat() {
  # Turn the listed feminine -ar/-ane forms into their -er/-ene
  # counterparts and the listed masculine -er/-ene forms into their
  # -ar/-ane counterparts, then drop all of these from
  # ‘klammeformer.txt’. Two plain sed expressions per file are used
  # instead of ‘\|’, which is a GNU extension in basic regular
  # expressions.
  {
    sed -e 's/ar$/er/' -e 's/ane$/ene/' subst-fem-ar.dat
    sed -e 's/er$/ar/' -e 's/ene$/ane/' subst-mask-er.dat
  } | sort -u | comm -23 klammeformer.txt - > klammeformer.dat

  rm -f klammeformer.txt
}
make_klammeformer_dat