# [Source-browser notice, apparently not part of the README itself:]
# Warning, /sdk/pology/lang/nn/exclusion/README is written in an unsupported language. File is not indexed.
# Exclusion dictionaries for Norwegian Nynorsk
# ────────────────────────────────────────────
#
# [Note: This file is runnable using ‘sh’.]
#
# To generate the dictionaries, you need four files:
#   fullform_nn-r103.txt, fullform_nn-r183.txt
#   fullform_nn.dat, fullform_nb.dat
#
# The files fullform_nn-r103.txt and fullform_nn-r183.txt
# should be checkouts of revision 103 and 183,
# respectively, of the file fullform_nn.txt from ordbanken
#   https://savannah.nongnu.org/projects/ordbanken/
# (Only revision 103 and earlier revisions contain
# information on ‘klammeformer’, while only revision 183
# and earlier revisions contain information on
# ‘unormert’ words.)
#
# The .dat files are word list files from the *latest*
# revision of ordbanken (compiled using the ‘make’ command).
#
# NOTE(review): the input files are treated as TAB-separated
# (‘cut’ is used with its default TAB delimiter), and a separator
# in front of a tag is matched with [[:blank:]] so that it may be
# either a TAB or a space. Confirm against the real ordbanken files.


# The file ‘klammeformer.dat’ contains a list of all
# words that were ‘klammeformer’ in the pre-2012
# Norwegian Nynorsk orthography. It is generated
# from ‘fullform_nn-r103.txt’ using these commands:

# Writes ‘klammeformer.txt’: every word form (field 3) that only occurs
# on lines tagged ‘klammeform’. Returns 1 if the input file is missing.
make_klammeformer_txt() {
  if [ ! -r fullform_nn-r103.txt ]; then
    printf '%s\n' "klammeformer: cannot read fullform_nn-r103.txt" >&2
    return 1
  fi

  # Lines starting with ‘*’ are skipped.
  grep -v '^\*' fullform_nn-r103.txt > alle.txt
  grep -v '[[:blank:]]klammeform' alle.txt > hovudformer.txt

  cut -f3 alle.txt | sort -u > alle.dat
  cut -f3 hovudformer.txt | sort -u > hovudformer.dat

  # hovudformer.dat is a subset of alle.dat, so only the lines unique
  # to alle.dat are of interest.
  comm -23 alle.dat hovudformer.dat > klammeformer.txt

  rm -f alle.txt alle.dat hovudformer.txt hovudformer.dat
}

make_klammeformer_txt

# And ‘klammeformer.txt’ is further modified into
# ‘klammeformer.dat’ later in the script, to remove
# some spellings of nouns which are now allowed.



# The file ‘unormert.dat’ contains a list of all
# words that were included in the latest
# version of Norsk ordbank that contained information
# on ‘unormert’ words but are not present in the
# current version of Norsk ordbank.
# That is, ‘unormert.dat’ contains a list of ‘unormert’ words and
# other words which have been removed from the dictionary.
#
# NOTE(review): the input files are treated as TAB-separated
# (‘cut’ is used with its default TAB delimiter); confirm against
# the real ordbanken files.

# Writes ‘unormert.dat’: word forms of fullform_nn-r183.txt (field 3)
# that are neither in fullform_nn.dat (field 2) nor in klammeformer.txt.
# Returns 1 if an input file is missing.
make_unormert_dat() {
  for input in fullform_nn.dat fullform_nn-r183.txt klammeformer.txt; do
    if [ ! -r "$input" ]; then
      printf '%s\n' "unormert: cannot read $input" >&2
      return 1
    fi
  done

  cut -f2 fullform_nn.dat | sort -u > alle.dat
  # Lines starting with ‘*’ are skipped.
  grep -v '^\*' fullform_nn-r183.txt | cut -f3 | sort -u > gamle.txt
  comm -13 alle.dat gamle.txt > unormert.txt
  comm -23 unormert.txt klammeformer.txt > unormert.dat
  rm -f alle.dat gamle.txt unormert.txt
}

make_unormert_dat



# The file ‘bokmal-words.dat’ contains a list of all words that
# are valid in Norwegian Bokmål, but not in Norwegian Nynorsk.
# Words containing only non-letter characters or containing
# uppercase letters are excluded, to avoid too many false
# positives. Still, there will be *many* false positives, so
# this file should not be used in any ‘default’ rule sets.
# It is generated from ‘fullform_nn.dat’ and ‘fullform_nb.dat’
# using these commands:

# usage: lowercase_words FILE
# Prints the sorted, unique words of field 2 of FILE. Entries with
# spaces are split into single words; words without any letter and
# words containing an uppercase letter are dropped.
lowercase_words() {
  cut -f2 "$1" |
    tr ' ' '\n' |
    grep -v -E '^[^A-Za-zÆØÅæøå]+$' |
    grep -v '[A-ZÆØÅ]' |
    sort -u
}

# Writes ‘bokmal-words.dat’. Returns 1 if an input file is missing.
make_bokmal_words_dat() {
  for input in fullform_nn.dat fullform_nb.dat; do
    if [ ! -r "$input" ]; then
      printf '%s\n' "bokmal-words: cannot read $input" >&2
      return 1
    fi
  done

  lowercase_words fullform_nn.dat > ord-nn.txt
  lowercase_words fullform_nb.dat > ord-nb.txt

  comm -23 ord-nb.txt ord-nn.txt > bokmal-words.dat

  rm -f ord-nn.txt ord-nb.txt
}

make_bokmal_words_dat



# The file ‘imperativfeil.dat’ contains a list of imperatives
# misspelled with an accent. For example, it contains the
# word ‘installér’ (should be spelled ‘installer’).

# Writes ‘imperativfeil.dat’: every ‘verb imp’ form of fullform_nn.dat
# that ends in -er and does not start with ‘-’, with -er turned into -ér.
# Returns 1 if the input file is missing.
make_imperativfeil_dat() {
  if [ ! -r fullform_nn.dat ]; then
    printf '%s\n' "imperativfeil: cannot read fullform_nn.dat" >&2
    return 1
  fi

  # [[:blank:]] accepts a TAB as well as a space between the two tags.
  grep 'verb[[:blank:]]imp' fullform_nn.dat |
    awk -F'\t' '{ print $2 }' |
    grep '^[^-].*er$' |
    sort -u |
    sed 's/er$/ér/' > imperativfeil.dat
}

make_imperativfeil_dat


# The file ‘e-infinitiv.dat’ contains a list of all infinitives
# ending in -e where there are no other word forms with the
# exact same spelling.
# For example, ‘e-infinitiv.dat’ contains the word ‘lagre’
# (should be spelled ‘lagra’ according to our translation guidelines),
# but not the word ‘opne’, as ‘opne’ is also used as an adjective,
# for example in ‘fleire opne program’. The file is generated from
# ‘fullform_nn.dat’ using these commands:
#
# NOTE(review): fullform_nn.dat is treated as TAB-separated, with
# field 1 compared against field 2 as base form vs. word form, and
# fields 3 and 4 holding the word class and the verb form.

# Writes ‘e-infinitiv.dat’. Returns 1 if the input file is missing.
make_e_infinitiv_dat() {
  if [ ! -r fullform_nn.dat ]; then
    printf '%s\n' "e-infinitiv: cannot read fullform_nn.dat" >&2
    return 1
  fi

  # Base forms in -e of verbs whose infinitive/imperative is also
  # listed with a final -a (‘lagre’ next to ‘lagra’).
  awk -F'\t' '
    $3 == "verb" && ($4 == "inf" || $4 == "imp") {
      form = $2
      last = substr(form, length(form))
      stem = substr(form, 1, length(form) - 1)
      if (last == "a" && $1 == stem "e")
        print $1
    }' fullform_nn.dat | sort -u > ea-inf.txt

  # Every word form in -e that is not a verb infinitive/imperative.
  grep -v -E '[[:blank:]]verb[[:blank:]]i(nf|mp)' fullform_nn.dat |
    cut -f2 |
    grep 'e$' |
    sort -u > e-ord.txt

  comm -23 ea-inf.txt e-ord.txt > e-infinitiv.dat
  rm -f ea-inf.txt e-ord.txt
}

make_e_infinitiv_dat


# The file ‘subst-mask-er.dat’ contains a list of -er/-ene inflections of
# masculine nouns that can have both a -ar/-ane or a -er/-ene suffix.
# For example, the noun «gjest» can be written as gjestar/gjestane
# or gjester/gjestene, so the output file contains ‘gjester’ and ‘gjestene’.
# Note that the (masculine) noun ‘elv’ is removed, as it’s a rare word
# and conflicts with the feminine ‘elv’.
#
# The file ‘subst-fem-ar.dat’ contains a list of -ar/-ane inflections of
# feminine nouns that can have both a -ar/-ane or a -er/-ene suffix.
# For example, the noun «sideelv» can be written as sideelver/sideelvene
# or sideelvar/sideelvane, so the output file contains ‘sideelvar’ and
# ‘sideelvane’.
#
# The files ‘subst-mask-er.dat’ and ‘subst-fem-ar.dat’ are generated
# from ‘fullform_nn.dat’ using these commands:
#
# NOTE(review): fullform_nn.dat is treated as TAB-separated; a
# separator around a tag is matched with [[:blank:]] so that it may
# be either a TAB or a space. Confirm against the real file.

# usage: alternate_plural_forms KEEP OTHER FILE
# FILE is TAB-separated with an id in field 1 and a word form in
# field 3. Prints each word form ending in KEEP"r" or KEEP"ne" whose
# preceding line has the same id and the same word form spelled with
# OTHER"r" or OTHER"ne" instead.
alternate_plural_forms() {
  awk -F'\t' -v keep="$1" -v other="$2" '
    function ends_with(word, suffix) {
      return substr(word, length(word) - length(suffix) + 1) == suffix
    }
    function with_suffix(word, old, new) {
      return substr(word, 1, length(word) - length(old)) new
    }
    {
      if ($1 == prev_id &&
          ((ends_with($3, keep "ne") &&
            prev_word == with_suffix($3, keep "ne", other "ne")) ||
           (ends_with($3, keep "r") &&
            prev_word == with_suffix($3, keep "r", other "r"))))
        print $3
      prev_id = $1
      prev_word = $3
    }' "$3"
}

# Writes ‘subst-mask-er.dat’ and ‘subst-fem-ar.dat’.
# Returns 1 if the input file is missing.
make_subst_suffix_lists() {
  if [ ! -r fullform_nn.dat ]; then
    printf '%s\n' "subst-mask-er/subst-fem-ar: cannot read fullform_nn.dat" >&2
    return 1
  fi
  tab=$(printf '\t')

  grep '[[:blank:]]subst' fullform_nn.dat > subst-ok.dat

  # Put a copy of the second-to-last field in front of every line, then
  # order the lines so that the spelling variants compared by
  # alternate_plural_forms end up on adjacent lines.
  awk -F'\t' '{ print $(NF-1) "\t" $0 }' subst-ok.dat |
    sort -t "$tab" -k2,2 -k1,1 -k5 -k3,3 > subst-ok-sort.dat

  # The masculine noun ‘elv’ is left out on purpose.
  grep '[[:blank:]]mask[[:blank:]]' subst-ok-sort.dat |
    grep -v '[[:blank:]]elv[[:blank:]].*mask' > subst-mask.dat
  grep '[[:blank:]]fem[[:blank:]]' subst-ok-sort.dat > subst-fem.dat

  alternate_plural_forms e a subst-mask.dat > subst-mask-er.dat
  alternate_plural_forms a e subst-fem.dat > subst-fem-ar.dat

  rm -f subst-ok.dat subst-ok-sort.dat subst-mask.dat subst-fem.dat
}

make_subst_suffix_lists


# The file ‘subst-mask-artikkel.dat’ contains a list of masculine
# nouns but prefixed with the indefinite articles ‘ei’ and ‘eit’.
# Only words that are not homographs with other words are included.
# The files ‘subst-fem-artikkel.dat’ and ‘subst-noyt-artikkel.dat’
# are similar, but for feminine and neuter words, respectively.
#
# The files ‘subst-mask-artikkel.dat’, ‘subst-fem-artikkel.dat’ and
# ‘subst-noyt-artikkel.dat’ are generated from ‘fullform_nn.dat’
# using these commands:
#
# NOTE(review): fullform_nn.dat is treated as TAB-separated; a
# separator around a tag is matched with [[:blank:]] so that it may
# be either a TAB or a space. Confirm against the real file.

# usage: base_forms GENDER_TAG
# Prints the sorted, unique word forms (field 2) of the lines in
# subst-grunnord.tmp that carry ‘subst’ followed by GENDER_TAG.
base_forms() {
  grep "[[:blank:]]subst[[:blank:]]$1" subst-grunnord.tmp |
    awk -F'\t' '{ print $2 }' |
    sort -u
}

# usage: wrong_articles ARTICLE1 ARTICLE2 OWN_WORDS OTHER_WORDS
# For every word that is in OWN_WORDS but not in OTHER_WORDS (both
# sorted), prints the word prefixed with ARTICLE1 and with ARTICLE2.
wrong_articles() {
  comm -23 "$3" "$4" |
    awk -v first="$1" -v second="$2" \
      '$0 != "" { print first " " $0; print second " " $0 }'
}

# Writes the three ‘subst-*-artikkel.dat’ files.
# Returns 1 if the input file is missing.
make_artikkel_lists() {
  if [ ! -r fullform_nn.dat ]; then
    printf '%s\n' "subst-artikkel: cannot read fullform_nn.dat" >&2
    return 1
  fi

  grep '[[:blank:]]subst[[:blank:]]' fullform_nn.dat > subst.tmp
  grep -v '[[:blank:]]subst[[:blank:]]' fullform_nn.dat |
    awk -F'\t' '{ print $2 }' |
    sort -u > ikkje-subst.tmp
  grep '[[:blank:]]eint[[:blank:]]ub[[:blank:]]' subst.tmp > subst-grunnord.tmp

  base_forms mask > subst-mask.tmp
  base_forms fem > subst-fem.tmp
  base_forms nøyt > subst-noyt.tmp

  # For each gender: every word that also exists as something else
  # (a non-noun, or a noun of another gender), i.e. the homographs.
  sort -u ikkje-subst.tmp subst-fem.tmp subst-noyt.tmp > ikkje-subst-mask.tmp
  sort -u ikkje-subst.tmp subst-mask.tmp subst-noyt.tmp > ikkje-subst-fem.tmp
  sort -u ikkje-subst.tmp subst-mask.tmp subst-fem.tmp > ikkje-subst-noyt.tmp

  wrong_articles ei eit subst-mask.tmp ikkje-subst-mask.tmp > subst-mask-artikkel.dat
  wrong_articles ein eit subst-fem.tmp ikkje-subst-fem.tmp > subst-fem-artikkel.dat
  wrong_articles ein ei subst-noyt.tmp ikkje-subst-noyt.tmp > subst-noyt-artikkel.dat

  rm -f subst.tmp ikkje-subst.tmp subst-grunnord.tmp \
    subst-mask.tmp subst-fem.tmp subst-noyt.tmp \
    ikkje-subst-mask.tmp ikkje-subst-fem.tmp ikkje-subst-noyt.tmp
}

make_artikkel_lists



# Remove some common (correct) spellings from the ‘klammeformer’
# word list (-ar/-ane suffix on masculine nouns and -er/-ene
# suffix on feminine nouns).

# Writes ‘klammeformer.dat’ from ‘klammeformer.txt’, ‘subst-fem-ar.dat’
# and ‘subst-mask-er.dat’, then removes ‘klammeformer.txt’.
# Returns 1 if an input file is missing.
make_klammeformer_dat() {
  for input in klammeformer.txt subst-fem-ar.dat subst-mask-er.dat; do
    if [ ! -r "$input" ]; then
      printf '%s\n' "klammeformer: cannot read $input" >&2
      return 1
    fi
  done

  # Turn each listed form into its counterpart spelling
  # (-ar/-ane <-> -er/-ene); those counterparts are the words to drop.
  {
    sed -e 's/ar$/er/' -e 's/ane$/ene/' subst-fem-ar.dat
    sed -e 's/er$/ar/' -e 's/ene$/ane/' subst-mask-er.dat
  } | sort -u > words-ok.dat

  comm -23 klammeformer.txt words-ok.dat > klammeformer.dat

  rm -f words-ok.dat klammeformer.txt
}

make_klammeformer_dat